In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pandas_profiling import ProfileReport

In [None]:
# Load every input table with 'user_id' as the index.
# NOTE(review): new_train / new_test look like pre-computed per-user feature
# tables (possibly the saved output of process()) — confirm their origin.
train = pd.read_csv('../input/productpurchasehistory/Training Data.csv', index_col='user_id')
train_target = pd.read_csv('../input/productpurchasehistory/Training Data Target.csv', index_col='user_id')
test = pd.read_csv('../input/productpurchasehistory/Test Data.csv', index_col='user_id')
new_train = pd.read_csv('../input/productpurchasehistory/new_train.csv', index_col='user_id')
new_test = pd.read_csv('../input/productpurchasehistory/new_test.csv', index_col='user_id')

In [None]:
# Remove the 'aov' column from the target table and the 'Unnamed: 0' column
# from the test table (presumably a leftover index column from an earlier
# to_csv — verify).  errors='ignore' keeps the cell re-runnable: on a second
# execution the columns are already gone and a plain drop would raise KeyError.
train_target = train_target.drop(columns='aov', errors='ignore')
test = test.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Distinct category values seen in the training purchases; process() and the
# LabelEncoder cell both read this array.
target_features = train['category'].unique()
# Distinct user ids (the index) of the training purchases.
train_users = train.index.unique()

In [None]:
def process(df):
    """Aggregate per-purchase rows into one feature row per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Purchase rows indexed by user id, with a 'category' column and a
        numeric 'aov' column.  Every category must be present in the
        module-level ``target_features``.

    Returns
    -------
    pandas.DataFrame
        One row per unique index value of ``df`` (first-seen order).  For every
        name in ``target_features`` there is a "<name> AOV" column holding the
        sum of 'aov' over that user's rows in the category, followed by a block
        of "<name> Freq" columns holding the number of such rows.

    Raises
    ------
    KeyError
        If ``df`` contains a category that is not in ``target_features``.
    """
    names = [str(name) for name in target_features]
    users = df.index.unique()

    # Map every purchase row to its (user row, category column) cell once,
    # instead of scanning the whole index for each row as iterrows() did.
    user_pos = users.get_indexer(df.index)
    cat_pos = pd.Index(names).get_indexer(df['category'].astype(str))

    unknown = cat_pos < 0
    if unknown.any():
        missing = sorted(set(df.loc[unknown, 'category'].astype(str)))
        raise KeyError("categories not in target_features: {}".format(missing))

    aov = np.zeros((len(users), len(names)), dtype=float)
    freq = np.zeros((len(users), len(names)), dtype=np.int64)

    # ufunc.at accumulates correctly when the same cell is hit several times
    # (plain fancy-index += would keep only the last contribution).
    np.add.at(aov, (user_pos, cat_pos), df['aov'].to_numpy(dtype=float))
    np.add.at(freq, (user_pos, cat_pos), 1)

    return pd.concat(
        [
            pd.DataFrame(aov, index=users, columns=[name + " AOV" for name in names]),
            pd.DataFrame(freq, index=users, columns=[name + " Freq" for name in names]),
        ],
        axis=1,
    )

In [None]:
# Build a pandas-profiling report for the training feature table, save it as
# HTML, and display it inline as the cell output.
train_report = ProfileReport(new_train, title="Training Data Profiling Report")
train_report.to_file("data_report.html")
train_report

In [None]:
# Same profiling report for the test feature table, written to its own file.
test_report = ProfileReport(new_test, title="Test Data Profiling Report")
test_report.to_file("tt_data_report.html")
test_report

In [None]:
def box_plot(dataset, column):
    """Draw a seaborn box plot of ``dataset[column]`` along the x axis."""
    values = dataset[column]
    sns.boxplot(x=values)

def remove_outliers(dataset, column, target=None, train=False):
    """Drop, in place, the rows of ``dataset`` whose ``column`` is an IQR outlier.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR, with the quartiles taken using 'midpoint' interpolation.
    The limits and the number of dropped rows are printed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter; modified in place.
    column : str
        Name of the numeric column to test.
    target : pandas object with ``drop``, optional
        Labels aligned with ``dataset`` by index.  When ``train`` is true the
        same index labels are dropped from it in place (labels that are not
        present are ignored).
    train : bool
        Whether ``target`` should be filtered alongside ``dataset``.

    Returns
    -------
    None
    """
    values = dataset[column]

    # Series.quantile keeps the 'midpoint' rule without relying on the
    # np.percentile ``interpolation=`` keyword, which NumPy deprecated in
    # favour of ``method=``.  It also skips NaN instead of turning both limits
    # into NaN.
    Q1 = values.quantile(0.25, interpolation='midpoint')
    Q3 = values.quantile(0.75, interpolation='midpoint')

    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    up_lim = Q3 + 1.5 * IQR

    print(column)
    print('low_limit is', low_lim)
    print('up_limit is', up_lim)

    outlier_index = dataset.index[(values > up_lim) | (values < low_lim)]

    # Rows, not columns, are what gets removed.
    print('Number of rows dropping: ', len(outlier_index))
    print('--------------------')

    dataset.drop(outlier_index, axis=0, inplace=True)
    # Guard against train=True being passed without a target.
    if train and target is not None:
        target.drop(outlier_index, axis=0, inplace=True, errors='ignore')

In [None]:
from sklearn.preprocessing import StandardScaler

# For continuous values
std_scaler = StandardScaler()

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way.
# NOTE(review): fitting the scaler on train + test lets test-set statistics
# influence the training features; fit on new_train only if that matters for
# the evaluation.
tot = pd.concat([new_train, new_test])
std_scaler.fit(tot)
new_train[new_train.columns] = std_scaler.transform(new_train)
new_test[new_test.columns] = std_scaler.transform(new_test)

In [None]:
# Labelled users: feature rows whose user_id appears in the target table.
X = new_train.loc[train_target.index]
# Users with features but no label; they are pseudo-labelled further down.
other_data = new_train[~new_train.index.isin(train_target.index)]
# Category labels, still as their original values at this point.
y = train_target['category']

In [None]:
def plot_target_dist(target_y):
    """Plot a count bar chart of ``target_y`` and write each bar's height above it."""
    figure, axis = plt.subplots(figsize=(10, 5))
    sns.countplot(x=target_y, ax=axis)
    plt.xticks(rotation=45)

    for bar in axis.patches:
        bar_height = bar.get_height()
        # Nudge the label slightly right of the bar's left edge and 50 units above it.
        label_xy = (bar.get_x() + 0.05, bar_height + 50)
        axis.annotate('{:.1f}'.format(bar_height), label_xy)

In [None]:
# Class distribution of all labelled users.
plot_target_dist(y)

In [None]:
# for column in ['Phones AOV']:
#     remove_outliers(X, column, y, True)

In [None]:
# Sanity check: features and labels should have the same number of rows.
print(X.shape)
print(y.shape)

In [None]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder

In [None]:
# Encode category values as integer class ids.  LabelEncoder works on 1-D
# label arrays; reshaping to a column vector only makes scikit-learn emit a
# DataConversionWarning before flattening it again, so pass the arrays as-is.
# NOTE: this cell replaces the Series `y` with an integer ndarray, so it
# cannot be re-run without first re-running the cell that builds `y`.
le = LabelEncoder()
le.fit(target_features)
y = le.transform(y.to_numpy())
y

In [None]:
# mee = MEstimateEncoder()
# y = mee.fit_transform(y, X.index)

In [None]:
# Hold out 20% of the labelled users, stratified by class, with a fixed seed.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, random_state = 10, test_size = 0.2, shuffle=True, stratify=y)

In [None]:
# Class distribution of the training split.
plot_target_dist(y_temp)

In [None]:
from collections import Counter
# Per-class row counts: `counter` for the training split, `counter1` for the
# full labelled set.
counter = Counter(y_temp)
counter1 = Counter(y)

# Under-sampling targets: identical to the observed counts except for class 15
# (presumably the dominant class — confirm against the label encoder).
undersampling_dict = counter.copy()
undersampling_dict[15] = 3000
final_us_dict = counter1.copy()
final_us_dict[15] = 5000

# Over-sampling targets start from the observed counts.
oversampling_dict = counter.copy()
final_os_dict = counter1.copy()

# Encoded class ids whose target count is scaled up by 2.5x.
indices = [6,14,7,4,5,16,12,10,8,17,0,9,11]

for index in indices:
    for counts in (oversampling_dict, final_os_dict):
        counts[index] = int(counts[index] * 2.5)

final_os_dict

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler


def sample_data(X_tbs, y_tbs, oversampling_dict, undersampling_dict, random_state=None):
    """Randomly over-sample ``(X_tbs, y_tbs)`` to the requested class counts.

    Parameters
    ----------
    X_tbs, y_tbs
        Features and labels to be resampled.
    oversampling_dict : dict
        Maps class label -> desired number of rows after over-sampling;
        passed to RandomOverSampler as ``sampling_strategy``.
    undersampling_dict : dict
        Accepted for call compatibility but not applied: the under-sampling
        step has never been part of the returned result.
    random_state : int or None
        Seed for the over-sampler so the duplicated rows are reproducible.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    # The RandomUnderSampler that used to be constructed here was never used,
    # so it is no longer built.
    over = RandomOverSampler(sampling_strategy=oversampling_dict, random_state=random_state)
    return over.fit_resample(X_tbs, y_tbs)

# Seeded so that Restart & Run All reproduces the same resampled training set.
X_sampled, y_sampled = sample_data(X_temp, y_temp, oversampling_dict, undersampling_dict, random_state=10)

In [None]:
# Class distribution after over-sampling the training split.
plot_target_dist(y_sampled)

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
from skmultilearn.adapt import MLkNN

# evaluate a model using repeated k-fold cross-validation
def evaluate_model(model, X, y):
    """Estimate the accuracy of ``model`` with 10-fold cross-validation.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X, y
        Features and labels; both must support indexing with an integer
        array (e.g. NumPy arrays).

    Returns
    -------
    list of float
        Accuracy of each fold.  Per-fold details plus the mean and standard
        deviation are also printed.
    """
    results = list()

    cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)

    for train_ix, test_ix in cv.split(X):
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]

        model.fit(X_train, y_train)
        # accuracy_score compares class labels.  predict_proba() returns a
        # per-class probability matrix that accuracy_score rejects, and some
        # estimators (e.g. hinge-loss SGDClassifier) do not implement it.
        yhat = model.predict(X_test)

        print("actual:", y_test)
        print("predicted:", yhat)
        acc = accuracy_score(y_test, yhat)

        print('>%.3f' % acc)
        results.append(acc)

    print('mean',np.mean(results))
    print('std', np.std(results))

    return results

In [None]:
# Import the function itself: the name `class_weight` is reused below for a
# local helper, which would otherwise shadow the imported module.
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter

# Sorted distinct class ids present in y.
temp = np.unique(y)

# `classes` and `y` are keyword-only in current scikit-learn, so a positional
# call raises TypeError there.
cw1 = compute_class_weight(class_weight='balanced', classes=temp, y=y)
cw1 = dict(zip(temp, cw1))

# Raw per-class row counts.
counter = Counter(y)
cw2 = dict(counter)
    

def class_weight(labels_dict, mu=0.15, total=None):
    """Compute log-smoothed class weights from per-class counts.

    Each class gets ``log(mu * total / count)``, floored at 1.

    Parameters
    ----------
    labels_dict : dict
        Maps class label -> number of samples of that class.
    mu : float
        Scaling factor applied inside the logarithm.
    total : number, optional
        Total number of samples.  Defaults to the sum of the counts in
        ``labels_dict``, so the function no longer depends on a global ``y``.

    Returns
    -------
    dict
        Maps class label -> weight (always >= 1).
    """
    if total is None:
        total = sum(labels_dict.values())

    weight = dict()
    for label, count in labels_dict.items():
        score = np.log(mu * total / count)
        weight[label] = score if score > 1 else 1

    return weight

# cw3: log-smoothed weights computed from the raw counts held in cw2.
cw3 = class_weight(cw2)
# cw2 is then overwritten with each class's share of the samples (count / len(y)).
cw2 = dict((i, cw2[i] / len(y)) for i in cw2.keys())

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier,VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# NOTE(review): `model` (the voting ensemble) is built but only the plain
# XGBClassifier is cross-validated in this cell.
model = VotingClassifier(estimators = [('xgb', XGBClassifier()), ('dtc', DecisionTreeClassifier()), ('rfc',RandomForestClassifier())])
xgb = XGBClassifier()
evaluate_model(xgb, X.to_numpy(), y)

In [None]:
# The features were standardised earlier, so they contain negative values,
# which MultinomialNB rejects with a ValueError.  GaussianNB is the
# naive-Bayes variant for continuous inputs.  The second, identical
# MultinomialNB instance is dropped.
nb_clf = GaussianNB()
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None)
lr = LogisticRegression()

for classifier in [nb_clf, sgd, lr]:
    evaluate_model(classifier, X.to_numpy(), y)

In [None]:
# Train CatBoost on the over-sampled training split, logging every 1000 iterations.
cat = CatBoostClassifier(iterations=5000)
cat.fit(X_sampled,y_sampled, verbose=1000)

In [None]:
# Hold-out accuracy of the CatBoost model.
accuracy_score(y_test, cat.predict(X_test))

In [None]:
# Over-sampled version of the full labelled set; it is what the model is
# refit on before predicting for new_test.
X_final, y_final = sample_data(X, y, final_os_dict, final_us_dict)

# Validation model.  X_test is a slice of X, so fitting on X_final would train
# on the very rows being scored and bias the per-class thresholds derived from
# y_pred.  Fit on the over-sampled training split instead.
xgb = XGBClassifier()
xgb.fit(X_sampled, y_sampled)
y_pred = xgb.predict_proba(X_test)
y_pred

In [None]:
from sklearn.metrics import auc, roc_curve, roc_auc_score

def one_vs_rest(i):
    """Build the binary (one-vs-rest) problem for the i-th class.

    Reads the module-level ``y_test``, ``temp`` and ``y_pred``.

    Returns
    -------
    tuple
        ``(y_true_binary, scores)`` where ``y_true_binary`` is 1 where
        ``y_test == temp[i]`` and 0 elsewhere, and ``scores`` is column ``i``
        of ``y_pred``.
    """
    class_scores = y_pred[:, i]
    is_class = np.where(y_test == temp[i], 1, 0)
    positives = np.count_nonzero(is_class == 1)

    print('Predicting for class: ', i)
    print(class_scores)
    print('Actual Ones: ', positives)

    # When the class is absent from y_test, the first sample is marked positive
    # so the binary target still contains both classes.
    if positives == 0:
        is_class[0] = 1

    return is_class, class_scores

# Per-class ROC data, keyed by column position in y_pred.
fpr, tpr, threshold, roc_auc = {}, {}, {}, {}

for i in range(len(temp)):
    y_ovr, y_pred_ovr = one_vs_rest(i)
    fpr[i], tpr[i], threshold[i] = roc_curve(y_ovr, y_pred_ovr)

    # Score the hard predictions obtained by cutting at each candidate threshold.
    roc_auc_score_ = [
        roc_auc_score(y_ovr, np.where(y_pred_ovr > thres, 1, 0))
        for thres in threshold[i]
    ]
    roc_auc[i] = roc_auc_score_

In [None]:
# For every class keep the candidate threshold whose hard predictions scored
# the highest ROC AUC (the first one wins on ties, per np.argmax).
best_threshold = []
for i in range(len(temp)):
    index = np.argmax(roc_auc[i])
    best_threshold.append(threshold[i][index])
    print('Max ROC AUC for ', i)
    print('Threshold: ', threshold[i][index])
    print('ROC AUC score: ', roc_auc[i][index])
    print('---------------------------------')

In [None]:
# Refit on the over-sampled full labelled set and get class probabilities for
# the competition test users.
xgb.fit(X_final, y_final)
y_pred_ans = xgb.predict_proba(new_test.to_numpy())
# accuracy_score(y_test, y_pred)
y_pred_ans

In [None]:
# Shift every class probability by that class's selected threshold (broadcast
# across rows).  The probabilities are recomputed from the model instead of
# being modified in place, so re-running this cell cannot subtract the
# thresholds a second time.
y_pred_ans = xgb.predict_proba(new_test.to_numpy()) - np.asarray(best_threshold)

In [None]:
# Threshold-adjusted scores for the test users.
y_pred_ans

In [None]:
# X_final, y_final = sample_data(X, y, final_os_dict, final_us_dict)

# xgb.fit(X_final, y_final)
# Predict a class id for every user that has features but no label.
y_pred_gen = xgb.predict(other_data)
# accuracy_score(y_test, y_pred)
y_pred_gen

In [None]:
# Distribution of the predicted class ids for the unlabelled users.
plt.hist(y_pred_gen)

In [None]:
# Turn the predicted ids back into category values and keep every
# pseudo-labelled user except those predicted as 'Phones'.
gen_train = pd.DataFrame({'category': le.inverse_transform(y_pred_gen)}, index = other_data.index)
gen_train = gen_train.loc[gen_train['category'] != 'Phones']
gen_train

In [None]:
# Features and encoded labels of the retained pseudo-labelled users.
comb_X = other_data.loc[gen_train.index]
comb_y = le.transform(gen_train['category'])

In [None]:
# Append the pseudo-labelled users to the labelled training data.
# NOTE(review): X and y are overwritten, so running this cell twice appends
# the same rows again; no later cell in this notebook refits on the result.
X = pd.concat([X, comb_X])
y = np.concatenate((y,comb_y))
y.shape

In [None]:
y_pred_proba = y_pred_ans

# For each test user take the three highest-scoring classes, best first, and
# map them back to category values.
pred_cat = [
    le.inverse_transform(temp[user_scores.argsort()[-3:][::-1]])
    for user_scores in y_pred_proba
]

# One "first, second, third" string per user.
formatted_pred = [
    str(top3[0]) + ", " + str(top3[1]) + ", " + str(top3[2])
    for top3 in pred_cat
]

In [None]:
# Write one row per test user: the user_id and its three predicted categories.
output = pd.DataFrame({'user_id': new_test.index, 'pred3': formatted_pred})
output.to_csv('basic-xgb-gen6.csv', index=False)