In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

In [2]:
train = pd.read_csv("train_8wry4cB.csv")
test = pd.read_csv("test_Yix80N0.csv")

In [3]:
train.head()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male


In [4]:
train.shape

(10500, 5)

In [5]:
test.head()

Unnamed: 0,session_id,startTime,endTime,ProductList
0,u12112,08/12/14 13:36,08/12/14 13:36,A00002/B00003/C00006/D19956/
1,u19725,19/12/14 13:52,19/12/14 13:52,A00002/B00005/C00067/D02026/
2,u11795,01/12/14 10:44,01/12/14 10:44,A00002/B00002/C00004/D12538/
3,u22639,08/12/14 20:19,08/12/14 20:22,A00002/B00003/C00079/D22781/;A00002/B00003/C00...
4,u18034,15/12/14 19:33,15/12/14 19:33,A00002/B00001/C00010/D23419/


In [6]:
test.shape

(4500, 4)

In [7]:
type(train)

pandas.core.frame.DataFrame

In [8]:
train.size

52500

In [9]:
train.tail()

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
10495,u15442,18/11/14 7:39,18/11/14 7:42,A00002/B00002/C00007/D06407/;A00002/B00002/C00...,female
10496,u17986,25/11/14 15:16,25/11/14 15:16,A00006/B00030/C00334/D11660/,female
10497,u22508,09/12/14 10:11,09/12/14 10:11,A00002/B00002/C00007/D18028/,female
10498,u17087,22/11/14 11:27,22/11/14 11:27,A00003/B00012/C00131/D09453/;A00003/B00012/C00...,female
10499,u23137,19/12/14 3:11,19/12/14 3:19,A00002/B00001/C00010/D02309/;A00002/B00002/C00...,female


In [10]:
train['ProductList'][0]

'A00002/B00003/C00006/D28435/;A00002/B00003/C00006/D02554/;A00002/B00003/C00006/D28436/;A00002/B00003/C00006/D28437/'

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 5 columns):
session_id     10500 non-null object
startTime      10500 non-null object
endTime        10500 non-null object
ProductList    10500 non-null object
gender         10500 non-null object
dtypes: object(5)
memory usage: 410.3+ KB


In [12]:
train['gender'].value_counts()

female    8192
male      2308
Name: gender, dtype: int64

In [13]:
train.isnull().sum()

session_id     0
startTime      0
endTime        0
ProductList    0
gender         0
dtype: int64

In [14]:
#Custom function for extract infor from product list
def extract_product(str):
    if ";"  in str:
        prd_lst = str.split(";")
        count_item = len(prd_lst)
        first_lv1 = prd_lst[0].split("/")[0]
        first_lv2 = prd_lst[0].split("/")[1]
        lv1_lst =[]
        lv2_lst =[]
        for item in prd_lst:
            lv1_lst.append(item.split("/")[0])
            lv2_lst.append(item.split("/")[1])
        unique_lv1 = len(set(lv1_lst))
        unique_lv2 = len(set(lv2_lst))
        most_freq_lv1 =  max(lv1_lst, key=Counter(lv1_lst).get)        
    else:
        lv_lst = str.split("/")
        first_lv1 = lv_lst[0]
        first_lv2 = lv_lst[1]
        count_item = 1
        unique_lv1 = 1
        unique_lv2 = 1
        most_freq_lv1 = first_lv1
    return (count_item,first_lv1,first_lv2,unique_lv1,unique_lv2,most_freq_lv1)

In [15]:
#Feature Extraction :
new_col = ('NumProduct','FirstA','FirstB','UniqueA','UniqueB','MostA')      
new_col_lst = train['ProductList'].apply(lambda x: extract_product(x))    
new_col_df = pd.DataFrame(new_col_lst.tolist(),columns =new_col)

In [16]:
data = pd.concat([train, new_col_df], axis=1)

In [17]:
# Time feature extraction
data['startTime'] = pd.to_datetime(data['startTime'])
data['endTime'] = pd.to_datetime(data['endTime'])
data['duration'] = data['endTime'] - data['startTime']
data['duration'] = data['duration'].astype('timedelta64[m]')
data['weekday'] = data['startTime'].dt.dayofweek
data['hour_24h'] = data['startTime'].dt.hour

In [18]:
drop_lst = ['session_id', 'startTime', 'endTime', 'ProductList']

data = data.drop(drop_lst,axis =1 )

In [19]:
data.head()

Unnamed: 0,gender,NumProduct,FirstA,FirstB,UniqueA,UniqueB,MostA,duration,weekday,hour_24h
0,female,4,A00002,B00003,1,1,A00002,1.0,0,18
1,male,7,A00001,B00009,1,1,A00001,6.0,1,14
2,female,1,A00002,B00001,1,1,A00002,0.0,6,15
3,female,3,A00002,B00004,1,1,A00002,3.0,6,2
4,male,2,A00001,B00001,1,1,A00001,2.0,2,16


In [20]:
data = pd.get_dummies(data)

In [22]:
#from sklearn.preprocessing import LabelEncoder
#number = LabelEncoder()
#data['gender'] = number.fit_transform(data['gender'].astype('str'))
#data['FirstA'] = number.fit_transform(data['FirstA'].astype('str'))
#data['FirstB'] = number.fit_transform(data['FirstB'].astype('str'))
#data['MostA'] = number.fit_transform(data['MostA'].astype('str'))

In [21]:
data.head()

Unnamed: 0,NumProduct,UniqueA,UniqueB,duration,weekday,hour_24h,gender_female,gender_male,FirstA_A00001,FirstA_A00002,...,MostA_A00002,MostA_A00003,MostA_A00004,MostA_A00005,MostA_A00006,MostA_A00007,MostA_A00008,MostA_A00009,MostA_A00010,MostA_A00011
0,4,1,1,1.0,0,18,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1,7,1,1,6.0,1,14,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0.0,6,15,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,3,1,1,3.0,6,2,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,2,1,1,2.0,2,16,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
test.isnull().sum().sum()

0

In [23]:
#Custom function for extract infor from product list
def extract_product(str):
    if ";"  in str:
        prd_lst = str.split(";")
        count_item = len(prd_lst)
        first_lv1 = prd_lst[0].split("/")[0]
        first_lv2 = prd_lst[0].split("/")[1]
        lv1_lst =[]
        lv2_lst =[]
        for item in prd_lst:
            lv1_lst.append(item.split("/")[0])
            lv2_lst.append(item.split("/")[1])
        unique_lv1 = len(set(lv1_lst))
        unique_lv2 = len(set(lv2_lst))
        most_freq_lv1 =  max(lv1_lst, key=Counter(lv1_lst).get)        
    else:
        lv_lst = str.split("/")
        first_lv1 = lv_lst[0]
        first_lv2 = lv_lst[1]
        count_item = 1
        unique_lv1 = 1
        unique_lv2 = 1
        most_freq_lv1 = first_lv1
    return (count_item,first_lv1,first_lv2,unique_lv1,unique_lv2,most_freq_lv1)

In [24]:
#Feature Extraction :
new_col = ('NumProduct','FirstA','FirstB','UniqueA','UniqueB','MostA')      
new_col_lst = test['ProductList'].apply(lambda x: extract_product(x))    
new_col_df = pd.DataFrame(new_col_lst.tolist(),columns =new_col)

In [25]:
test_data = pd.concat([test, new_col_df], axis=1)

In [26]:
# Time feature extraction
test_data['startTime'] = pd.to_datetime(test_data['startTime'])
test_data['endTime'] = pd.to_datetime(test_data['endTime'])
test_data['duration'] = test_data['endTime'] - test_data['startTime']
test_data['duration'] = test_data['duration'].astype('timedelta64[m]')
test_data['weekday'] = test_data['startTime'].dt.dayofweek
test_data['hour_24h'] = test_data['startTime'].dt.hour

In [27]:
drop_lst = ['session_id', 'startTime', 'endTime', 'ProductList']

test_data = test_data.drop(drop_lst,axis =1 )

In [28]:
test_data.head()

Unnamed: 0,NumProduct,FirstA,FirstB,UniqueA,UniqueB,MostA,duration,weekday,hour_24h
0,1,A00002,B00003,1,1,A00002,0.0,1,13
1,1,A00002,B00005,1,1,A00002,0.0,4,13
2,1,A00002,B00002,1,1,A00002,0.0,6,10
3,4,A00002,B00003,1,1,A00002,3.0,1,20
4,1,A00002,B00001,1,1,A00002,0.0,0,19


In [31]:
#from sklearn.preprocessing import LabelEncoder
#number = LabelEncoder()
#data['gender'] = number.fit_transform(data['gender'].astype('str'))
#test_data['FirstA'] = number.fit_transform(test_data['FirstA'].astype('str'))
#test_data['FirstB'] = number.fit_transform(test_data['FirstB'].astype('str'))
#test_data['MostA'] = number.fit_transform(test_data['MostA'].astype('str'))

In [29]:
test_data = pd.get_dummies(test_data)

In [30]:
test_data.shape

(4500, 99)

In [31]:
data.shape

(10500, 112)

In [32]:
test_data.shape
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [33]:
# Devide data set into train set , test set
X = data.drop('gender', axis = 1)
Y = data.gender

val_size = 0.40
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=val_size)

KeyError: "['gender'] not found in axis"

In [34]:
X.shape, X_train.shape, X_val.shape

NameError: name 'X' is not defined

In [94]:
clf = RandomForestClassifier()
clf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [95]:
Y_pred = clf.predict(test_data)
#Y_pred = clf.predict(X_val)
clf.score(X_train, Y_train)
acc_random_forest = round(clf.score(X_train, Y_train) * 100, 2)
acc_random_forest

94.43

In [96]:
Y_pred.shape

(4500,)

In [97]:
pred=pd.DataFrame(Y_pred)
sub_df=pd.read_csv('sample_submission_opxHi4g.csv')

In [98]:
pred.shape

(4500, 1)

In [1]:
datasets=pd.concat([sub_df['session_id'],pred],axis=1)
datasets.columns=['session_id','gender']

NameError: name 'pd' is not defined

In [106]:
datasets.to_csv('sample.csv',index=False)

In [107]:
datasets

Unnamed: 0,session_id,gender
0,u12112,0
1,u19725,0
2,u11795,0
3,u22639,0
4,u18034,0
...,...,...
4495,u23966,1
4496,u20527,0
4497,u13253,0
4498,u17094,1


In [108]:
print("Evalute based on validation set")
print("f1 : " + " %s" % f1_score(Y_val, clf.predict(X_val)))
print("accuracy score" + " %s" % accuracy_score(Y_val, clf.predict(X_val)))
print("recall score micro: " + " %s" % recall_score(Y_val, clf.predict(X_val), average='micro'))
print("recall score macro: " + " %s" % recall_score(Y_val, clf.predict(X_val), average='macro'))

Evalute based on validation set
f1 :  0.6085904416212946
accuracy score 0.8459523809523809
recall score micro:  0.8459523809523809
recall score macro:  0.7363496604946473


In [109]:
final_score = Counter(zip(Y, clf.predict(X)))
tp= final_score[1,1]
tn= final_score[0,0]
fp= final_score[0,1]
fn= final_score[1,0]
acc = (tp+tn)/len(Y)
recall1 = tp/(tp+fn)
recall2 = tn/(tn+fp)
recall_n = (recall1 + recall2) / 2
print("Performance on total data set:")
print("Accuracy " + "%.2f" % acc)
print("RECALL AVG FINAL  : " + "%.2f" % recall_n) 

Performance on total data set:
Accuracy 0.90
RECALL AVG FINAL  : 0.83


In [35]:
#oversampling
from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'