In [1]:
import sys
!{sys.executable} -m pip install xgboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score



In [2]:
df = pd.read_csv('train.csv')
columns = list(df.columns)
non_medical = columns[0:79]
medical = columns[79:127]

med  = df[medical]
med = med.sum(axis = 1)

df['Product_Info_2'] = pd.Categorical(df['Product_Info_2'])
dfDummies = pd.get_dummies(df['Product_Info_2'], prefix = 'P2')

train = df[non_medical]
train = train.drop(columns = 'Product_Info_2')
train['Response'] = df['Response']
train['Keyword'] = med
train = pd.concat([train, dfDummies], axis=1)
train.head

<bound method NDFrame.head of           Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
0          2               1              10        0.076923               2   
1          5               1              26        0.076923               2   
2          6               1              26        0.076923               2   
3          7               1              10        0.487179               2   
4          8               1              26        0.230769               2   
5         10               1              26        0.230769               3   
6         11               1              10        0.166194               2   
7         14               1              26        0.076923               2   
8         15               1              26        0.230769               2   
9         16               1              21        0.076923               2   
10        17               1              26        0.128205               2   
11        

In [3]:
train.Keyword

0        0
1        0
2        0
3        1
4        0
5        2
6        0
7        0
8        1
9        2
10       4
11       1
12       1
13       1
14       2
15       3
16       1
17       0
18       1
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       1
27       0
28       2
29       2
        ..
59351    0
59352    1
59353    3
59354    1
59355    1
59356    2
59357    0
59358    0
59359    1
59360    1
59361    0
59362    0
59363    0
59364    1
59365    0
59366    2
59367    1
59368    0
59369    6
59370    0
59371    1
59372    4
59373    0
59374    0
59375    1
59376    0
59377    0
59378    1
59379    2
59380    0
Name: Keyword, Length: 59381, dtype: int64

In [4]:
df = pd.read_csv('test.csv')
columns = list(df.columns)
non_medical = columns[0:79]
medical = columns[79:127]

med  = df[medical]
med = med.sum(axis = 1)

df['Product_Info_2'] = pd.Categorical(df['Product_Info_2'])
dfDummies = pd.get_dummies(df['Product_Info_2'], prefix = 'P2')

test = df[non_medical]
test = test.drop(columns = 'Product_Info_2')
test['Keyword'] = med
test = pd.concat([test, dfDummies], axis=1)

In [5]:
clf = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                           random_state=1, max_iter = 100)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
clf4 = AdaBoostClassifier(n_estimators=100, random_state=0)
clf5 = DecisionTreeClassifier(max_depth=20, min_samples_split=20,
     random_state=0)
clf6 = ExtraTreesClassifier(n_estimators=100, max_depth=20,
     min_samples_split=20, random_state=0)
clf7 = XGBClassifier()

In [6]:
clf1 = VotingClassifier(estimators=[
         ('lr', clf), ('rf', clf2), ('gnb', clf3), ('ada', clf4), ('5', clf5), ('6', clf6), ('7', clf7)], voting='hard')

In [7]:
X = train
X = X.drop(columns  = 'Id')
y = X['Response']
X = X.fillna(-1)

X = X.drop(columns = 'Response')

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.3, random_state=0)

scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
clf1 = clf1.fit(X_train, y_train)
print(clf1.score(X_train,y_train))



0.7230909878265891


In [9]:
predictions_test = clf1.predict(X_test)
print(clf1.score(X_test, y_test))

0.5551501543642997


In [10]:
from sklearn.metrics import classification_report, confusion_matrix
CM = confusion_matrix(y_test, predictions_test)
print(CM)
print(classification_report(y_test, predictions_test))

[[ 406  313   20   39  133  347  159  406]
 [ 211  561   19   40  195  376  172  389]
 [  27   40   91   73   18   53    3   10]
 [  27   10   21  244    0   47    4   57]
 [  73  222    1    0  754  292   72  155]
 [ 161  225    0    8  141 1717  353  743]
 [  78   64    0    2   18  490  830 1002]
 [  33   29    0    2   18  306  228 5287]]
              precision    recall  f1-score   support

           1       0.40      0.22      0.29      1823
           2       0.38      0.29      0.33      1963
           3       0.60      0.29      0.39       315
           4       0.60      0.60      0.60       410
           5       0.59      0.48      0.53      1569
           6       0.47      0.51      0.49      3348
           7       0.46      0.33      0.39      2484
           8       0.66      0.90      0.76      5903

    accuracy                           0.56     17815
   macro avg       0.52      0.45      0.47     17815
weighted avg       0.53      0.56      0.53     17815



In [11]:
test_noID = test.drop(columns = ['Id'])
test_noID = test_noID.fillna(-1)

scaler = StandardScaler()
scaler.fit(test_noID)
test_noID = scaler.transform(test_noID)
predictions_test = clf1.predict(test_noID)
        
test['Response'] = predictions_test
submission = test[['Id', 'Response']]
submission.set_index('Id', inplace = True)
submission.to_csv('Submission.csv', float_format='%.0f')
print(submission)

       Response
Id             
1             7
3             8
4             6
9             8
12            8
13            8
21            8
28            8
30            7
36            8
38            8
43            8
45            4
48            8
50            4
51            8
54            7
55            8
59            8
62            1
63            8
66            8
69            8
82            8
83            6
84            6
86            8
89            8
90            2
92            8
...         ...
79004         7
79007         8
79020         6
79022         1
79027         1
79028         8
79031         8
79035         1
79038         8
79047         8
79048         5
79051         6
79054         5
79060         6
79064         8
79065         5
79067         8
79071         5
79072         6
79073         8
79080         6
79083         2
79084         8
79085         6
79089         8
79093         8
79099         8
79102         1
79125         2
79129   