## Kaggle
##### https://www.kaggle.com/c/postsold-prediction

### Load data

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the tab-separated training table. `raw_data` keeps the values as read
# from disk; the preprocessing functions write into `preprocessed_data`.
raw_data = pd.read_csv('./data/train.tsv', sep = '\t')
preprocessed_data = raw_data.copy()

In [None]:
# Preview the first rows of the loaded table.
raw_data.head()

### Preprocess data

In [None]:
# Columns removed by resetData() before modelling: the first column of the
# loaded table plus the named columns below.
useless_columns = [
    raw_data.columns[0],
    'date_created',
    'name_text',
    'owner_id',
    'product_id',
    'lat',
    'long',
]

In [None]:
# Name of the label column; it is split off as `y` and kept out of `X`.
target_feature = 'sold_fast'

In [None]:
# Real-valued feature columns.
real_features = ['price']

In [None]:
# Categorical feature columns.
# Every entry needs its own trailing comma: without one, Python silently
# concatenates adjacent string literals ('city' 'product_type' would become
# the single bogus name 'cityproduct_type').
cat_features = [
    'category_id',
    'img_num',
    'city',
    'product_type',
    'region',
    'sold_mode',
    'subcategory_id',
]

In [None]:
# Free-text columns; resetData() drops them from the modelling frame.
text_features = ['desc_text', 'properties']

In [None]:
# Date columns (the variable is spelled "data", but it lists the creation date).
data_features = ['date_created']

In [None]:
# Boolean flag columns; preprocessBooleanFeatures() casts them to int.
bool_features = ['delivery_available', 'payment_available']

In [None]:
# Categorical columns that encodeCatFeature() replaces with a score derived
# from the per-category mean of the target.
features_to_encode = [
    'region',
    'category_id',
    'img_num',
    'city',
    'subcategory_id',
]

In [None]:
def swapColumns():
    """Return the global `data` with the target column moved to the end.

    If `target_feature` is not a column of `data`, the column order is
    returned unchanged.
    """
    ordered = [name for name in data.columns if name != target_feature]
    if target_feature in data.columns:
        ordered.append(target_feature)
    return data[ordered]

In [None]:
def resetData():
    """Rebuild the global modelling frame `data` from `preprocessed_data`.

    The columns listed in `useless_columns` and in `text_features` are
    dropped, then swapColumns() puts the target column last.
    """
    global data
    data = preprocessed_data.drop(columns=useless_columns)
    data = data.drop(columns=text_features)
    data = swapColumns()


resetData()

In [None]:
def preprocessRealFeatures():
    """No-op hook for real-valued features; no transformation is applied."""
    return None

In [None]:
def preprocessCatFeatures():
    """Clean up and encode the categorical columns of `preprocessed_data`.

    First the two column-specific recodings run (category id, product type),
    then every column in `features_to_encode` is passed to encodeCatFeature().
    """
    for recode in (preprocessCategoryID, preprocessProductType):
        recode()
    for column in features_to_encode:
        encodeCatFeature(column)

In [None]:
def preprocessCategoryID():
    """Recode `category_id` value 100 to 0 in `preprocessed_data`, in place.

    The write goes through `.loc` instead of mutating the array returned by
    `.values`: that array is not guaranteed to be a writable view of the
    frame (with pandas Copy-on-Write it is read-only), so writing into it can
    raise or leave the frame untouched.
    """
    is_100 = preprocessed_data['category_id'] == 100
    preprocessed_data.loc[is_100, 'category_id'] = 0
    
def preprocessProductType():
    """Collapse `product_type` to a 1 / 0 flag in `preprocessed_data`, in place.

    Every value different from 1 becomes 0. The write goes through `.loc`
    instead of mutating the array returned by `.values`: that array is not
    guaranteed to be a writable view of the frame (with pandas Copy-on-Write
    it is read-only), so writing into it can raise or leave the frame
    untouched.
    """
    not_one = preprocessed_data['product_type'] != 1
    preprocessed_data.loc[not_one, 'product_type'] = 0

In [None]:
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split

# def createDictionary(feature):
#     col = raw_data[feature].values
#     labels = raw_data[target_feature].values
#     cat_dict = defaultdict(int)
#     for aval in np.unique(col):
#         labels_for_val = labels[col == aval]
#         n = len(labels_for_val)
#         n_1 = len(labels_for_val[labels_for_val == 1])
#         p = 1.0 * n_1 / n
#         cat_dict[aval] = p
#     return cat_dict

def createDictionary(feature):
    """Build a {category value: mean target} lookup for one column.

    A seeded half of the rows of `preprocessed_data` is drawn with
    `train_test_split`, and the mean of `target_feature` is computed for
    every category value seen in that half. Values seen 10 times or fewer
    get the share of rows with target == 1 over the whole column instead.

    feature: name of the categorical column.
    Returns a plain dict.
    """
    col = preprocessed_data[feature].values
    target = preprocessed_data[target_feature].values

    sample_x, _, sample_y, _ = train_test_split(
        col, target, test_size=0.5, random_state=4010)

    # Fallback for rarely seen categories: overall share of positive rows.
    avg = len(target[target == 1]) / len(target)

    values = defaultdict(int)
    counts = Counter()
    for category, label in zip(sample_x, sample_y):
        values[category] += label
        counts[category] += 1

    return {
        category: (total / counts[category] if counts[category] > 10 else avg)
        for category, total in values.items()
    }

In [None]:
# Split the modelling frame into a feature matrix and a label vector.
X_cols = data.columns[data.columns != target_feature]  # every column except the target
X = data[X_cols].values
y = data[target_feature].values

In [None]:
# Lookup tables for the encoded categorical columns:
# dicts[column name] -> dict returned by createDictionary() for that column.
dicts = dict()
for feature in features_to_encode:
    feature_dict = createDictionary(feature)
    dicts[feature] = feature_dict

In [None]:
# def encodeCatFeature(feature): 
#     cat_dict = dicts[feature]
#     col = preprocessed_data[feature].values
    
#     for i in range(len(col)):
#         aval = col[i]
#         if aval in cat_dict:
#             col[i] = cat_dict[aval] * 100
#         else:
#             col[i] = 0.0

def encodeCatFeature(feature):
    """Replace a categorical column of `preprocessed_data` by an integer score.

    The per-category means in `dicts[feature]` are min-max scaled to [0, 1]
    and rounded to whole percents (0..100). Column values that have no entry
    in the dictionary are encoded as 0.

    Differences from the earlier in-place implementation:
    * `dicts[feature]` is only read. It used to be overwritten with the
      scaled scores, so a second call (e.g. for the test table) re-scaled
      already scaled values.
    * When all means are equal (or the dictionary is empty) every category
      gets 0 instead of failing on a zero range / empty `min()`.
    * The column is assigned back to the frame instead of being mutated
      through `.values`, which is not guaranteed to be a writable view.

    feature: name of the column to encode; must be a key of `dicts`.
    """
    mean_values = dicts[feature]

    if mean_values:
        low = min(mean_values.values())
        span = max(mean_values.values()) - low
    else:
        low, span = 0, 0

    scores = {
        category: (round((mean - low) / span / 0.01) if span else 0)
        for category, mean in mean_values.items()
    }

    # Series.map leaves NaN where a value has no entry in `scores`.
    encoded = preprocessed_data[feature].map(scores)
    preprocessed_data[feature] = encoded.fillna(0).astype(int)

#### Text processing

In [None]:
from tqdm import tqdm_notebook

import nltk
# Fetch the stop-word corpus used below (may need network access on first run).
nltk.download('stopwords')
import string

# Shared text-processing helpers used by _processText / preprocessTextFeatures.
word_tokenizer = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('russian')

from nltk.stem.snowball import SnowballStemmer 
stemmer = SnowballStemmer("russian")

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
    
def _processText(data):
    """Tokenize the `desc_text` column of `data`.

    Each description is lower-cased and split with `word_tokenizer`; stop
    words and tokens made up only of punctuation characters are removed.

    Fixes over the previous version:
    * punctuation is tested per character. `word in string.punctuation` is a
      substring test, so runs such as '!!!' or '...' were kept;
    * a non-string cell (e.g. NaN for a missing description) is treated as
      empty text instead of failing on `.lower()`;
    * stop words are looked up in a set rather than a list.

    data: frame with a `desc_text` column.
    Returns a list with one list of tokens per row.
    """
    stop_set = set(stop_words)
    punctuation = set(string.punctuation)

    texts = []
    for item in data['desc_text']:
        text = item if isinstance(item, str) else ''
        tokens = word_tokenizer.tokenize(text.lower())
        texts.append([
            word for word in tokens
            if word not in stop_set
            and not all(ch in punctuation for ch in word)
        ])
    return texts

def preprocessTextFeatures():
    """Turn the `desc_text` column of `raw_data` into a dense feature matrix.

    Steps: tokenize with `_processText`, stem every token with `stemmer`,
    build a TF-IDF matrix capped at 40000 features, then reduce it with
    `TruncatedSVD(n_components = 1000)`.

    The exploratory fit on the first 100 documents was removed: its result
    was never used, and it called `get_feature_names()`, which current
    scikit-learn no longer provides (`get_feature_names_out()` replaced it).

    Returns the array produced by `TruncatedSVD.fit_transform`.
    """
    tokenized = _processText(raw_data)

    # Stemming is the slow part, hence the progress bar.
    stemmed = [' '.join(map(stemmer.stem, tokens))
               for tokens in tqdm_notebook(tokenized)]

    vectorizer = TfidfVectorizer(max_features = 40000)
    tfidf = vectorizer.fit_transform(stemmed)

    svd = TruncatedSVD(n_components = 1000)
    return svd.fit_transform(tfidf)
    

In [None]:
import datetime

def preprocessDataFeatures():
    """Derive calendar features from `raw_data['date_created']`.

    Adds three columns to `preprocessed_data`:
    * `month`      - month number,
    * `day`        - day of the month,
    * `is_weekend` - 1 for Saturday/Sunday, otherwise 0.

    Dates must be strings in '%Y-%m-%d' format; anything else raises
    ValueError from `strptime`.

    Fix: `weekday()` numbers Monday as 0 and Sunday as 6, so the weekend is
    `>= 5`. The previous `>= 4` threshold also flagged Fridays.
    """
    parsed = [datetime.datetime.strptime(date, '%Y-%m-%d')
              for date in raw_data['date_created'].values]

    preprocessed_data['month'] = [moment.month for moment in parsed]
    preprocessed_data['day'] = [moment.day for moment in parsed]
    preprocessed_data['is_weekend'] = [int(moment.weekday() >= 5)
                                       for moment in parsed]

In [None]:
def preprocessBooleanFeatures():
    """Cast every column listed in `bool_features` to int in `preprocessed_data`."""
    for flag in bool_features:
        as_int = preprocessed_data[flag].astype(int)
        preprocessed_data[flag] = as_int

In [None]:
def preprocess():
    """Run the enabled preprocessing stages on `preprocessed_data`, in order."""
    stages = (
        preprocessRealFeatures,
        preprocessCatFeatures,
        # preprocessTextFeatures stays disabled here.
        preprocessDataFeatures,
        preprocessBooleanFeatures,
    )
    for stage in stages:
        stage()


preprocess()

In [None]:
# Rebuild `data` from the current `preprocessed_data`, then preview it.
resetData()
data.head()

In [None]:
# OneHotEncoding
from sklearn.preprocessing import OneHotEncoder

# One encoder per categorical column. handle_unknown='ignore' makes categories
# that were not seen during fit encode as all zeros instead of raising.
ohe_category_id = OneHotEncoder(handle_unknown='ignore')
ohe_region_id = OneHotEncoder(handle_unknown='ignore')
ohe_img_num_id = OneHotEncoder(handle_unknown='ignore')

# NOTE(review): the encoders are fitted on columns of `data`, i.e. after
# preprocess() has run, while the following cell transforms columns of
# `raw_data` - confirm that fitting on the processed values is intended.
ohe_category_id.fit(data['category_id'].values.reshape(-1, 1))
ohe_region_id.fit(data['region'].values.reshape(-1, 1))
ohe_img_num_id.fit(data['img_num'].values.reshape(-1, 1))

In [None]:
# Optional extra inputs that can be appended to the feature matrix.
extra_cols = ['lat', 'long', 'category_id']
extra_data = raw_data[extra_cols].values

# One-hot encode each column with the encoder fitted for that column.
# (All three used to go through `ohe_category_id`, so `region` and `img_num`
# were matched against category-id categories.)
category_id = ohe_category_id.transform(raw_data['category_id'].values.reshape(-1, 1)).toarray()
region = ohe_region_id.transform(raw_data['region'].values.reshape(-1, 1)).toarray()
img_num = ohe_img_num_id.transform(raw_data['img_num'].values.reshape(-1, 1)).toarray()

# Every column of `data` except the target is a model input.
ignore_features = [target_feature]
X_cols = []
for col in data.columns:
    if col not in ignore_features:
        X_cols.append(col)

X = data[X_cols].values
# To include the extra inputs, concatenate them column-wise instead:
# X = np.concatenate(tuple([X, extra_data, category_id, region, img_num]), axis=1)
X = np.concatenate(tuple([X]), axis=1)
y = data[target_feature].values

### Fit-Predict

Always shuffle your data, and don't forget to fix `random_seed` and `random_state`.

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on only
# 30%. If a 20/80 split was intended, test_size should be 0.2 - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.7,      # 70% for test, 30% for train
                                                    random_state=72)  # fixed seed: the shuffled split is reproducible

In [None]:
# Raise pandas' display limits so wide frames are not truncated, then preview
# the first rows of the training matrix.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.DataFrame(data=X_train[0:,0:]).head()

In [None]:
# non-ml approach
# from sklearn.naive_bayes import GaussianNB
# model = GaussianNB()

from xgboost import XGBClassifier

# Gradient-boosted trees: 800 shallow trees (depth 3) with a small learning rate.
# NOTE(review): scale_pos_weight=3.33 presumably mirrors the negative/positive
# class ratio - confirm against the label distribution.
# NOTE(review): newer XGBoost releases use `verbosity` instead of `silent` -
# verify against the installed version.
model = XGBClassifier(n_estimators=800, learning_rate=0.03, max_depth=3, n_jobs=14, 
                        colsample_bytree=0.6, scale_pos_weight=3.33, silent=0)

model.fit(X_train, y_train)

In [None]:
# # Feature importance via CatBoost
# from catboost import CatBoostClassifier, Pool, cv

# X_train2, X_validation2, y_train2, y_validation2 = train_test_split(X_train, y_train, train_size=0.75, random_state=42)

# params = {
#     'iterations': 500,
#     'learning_rate': 0.1,
#     'eval_metric': 'auc',
#     'random_seed': 42,
#     'logging_level': 'Silent',
#     'use_best_model': False,
#     'task_type' : 'GPU'
# }
# train_pool = Pool(X_train2, y_train2)
# validate_pool = Pool(X_validation2, y_validation2)

# model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
# feature_importances = model.get_feature_importance(train_pool)
# feature_names = data.columns
# for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
#     print('{}: {}'.format(name, score))

In [None]:
from sklearn.metrics import roc_auc_score

# Score both splits with ROC AUC. Column 1 of predict_proba is taken as the
# probability of the positive class (assumes labels are 0/1 - TODO confirm).
train_proba = model.predict_proba(X_train)[:, 1]
test_proba = model.predict_proba(X_test)[:, 1]

auc_train = roc_auc_score(y_train, train_proba)
auc_test = roc_auc_score(y_test, test_proba)

# A large gap between the two numbers points to overfitting.
print("Train AUC: ", auc_train)
print("Test AUC:  ", auc_test)

### Visualisation

In [None]:
def drawFeatureHist(features):
    """Plot per-feature histograms of `data`, split by target value.

    For every name in `features` one subplot shows two overlaid histograms:
    rows with target 0 in red and rows with target 1 in blue.

    The grid has 3 columns and at least 4 rows, and grows when more than 12
    features are passed (a fixed 4x3 grid raises on the 13th subplot). The
    subplot position comes from `enumerate`, so repeated names each get their
    own panel.

    features: sequence of column names of `data`.
    """
    n_cols = 3
    n_rows = max(4, (len(features) + n_cols - 1) // n_cols)
    bins = 25

    # 16x16 for the default 4 rows; taller when rows are added.
    plt.figure(figsize=(16, 4 * n_rows))

    y_col = data[target_feature].values
    for position, feature in enumerate(features, start=1):
        plt.subplot(n_rows, n_cols, position)

        x_col = data[feature].values
        plt.hist(x_col[y_col == 0], bins=bins, color = 'r', alpha=0.5, label='0')
        plt.hist(x_col[y_col == 1], bins=bins, color = 'b', alpha=0.5, label='1')

        plt.xlabel(feature)
        plt.ylabel('Counts')
        plt.legend(loc='best')

    plt.show()

# drawFeatureHist(list(data.columns))

### Evaluate on test_data

In [None]:
# Push the unlabeled test table through the same pipeline: the globals
# `raw_data` / `preprocessed_data` are rebound to it, so preprocess() and
# resetData() now operate on the test rows. `dicts` is not rebuilt here, so
# the category lookups computed earlier are reused.
raw_data = pd.read_csv('./data/test_nolabel.tsv', sep = '\t')
preprocessed_data = raw_data.copy()
preprocess()
resetData()

In [None]:
# Feature matrix for prediction.
# NOTE(review): assumes the test frame has the same columns, in the same
# order, as the training matrix - confirm with the preview below.
X_predict = data.values
data.head()

In [None]:
# Predicted class probabilities for every test row.
proba = model.predict_proba(X_predict)

### Save the file for submission

In [None]:
# Write the submission: product id plus the second probability column.
# Note that this rebinds the global `data` to the submission frame, and that
# the './Submission' directory must already exist.
product_id = raw_data['product_id'].values
data = pd.DataFrame.from_dict({'product_id' : product_id, 'score' : proba[:, 1]})
data.to_csv('./Submission/to_submit', sep = ',', index = False)