In [350]:
import numpy as np
import pandas as pd
import io
from sklearn.model_selection import GridSearchCV
from google.colab import files
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [118]:
test_f = files.upload()

Saving test.csv to test (1).csv


In [119]:
train_f = files.upload()

Saving train.csv to train (1).csv


In [482]:
train = pd.read_csv(io.BytesIO(train_f['train.csv']),encoding = "ISO-8859-1")
test = pd.read_csv(io.BytesIO(test_f['test.csv']),encoding = "ISO-8859-1")

In [483]:
train

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,191,8.0,1,16,1,8,10,5,10.0,6,...,7.0,7.0,7.0,7.0,7.0,,,,,
1,131,10.0,1,18,1,5,10,10,,10,...,4.0,4.0,4.0,4.0,4.0,,,,,
2,531,1.0,1,2,2,21,22,15,14.0,2,...,,,,,,,,,,
3,549,19.0,1,38,2,21,22,3,7.0,19,...,7.0,9.0,8.0,7.0,8.0,5.0,8.0,8.0,6.0,8.0
4,403,7.0,1,14,2,15,18,9,15.0,13,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,81,6.0,0,11,2,4,18,1,,7,...,8.0,7.0,7.0,8.0,6.0,,,,,
414,184,1.0,1,2,1,8,10,2,3.0,10,...,6.0,9.0,9.0,7.0,6.0,,,,,
415,438,9.0,0,17,2,17,14,13,13.0,8,...,7.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0,7.0,7.0
416,190,7.0,1,14,1,8,10,5,6.0,10,...,,,,,,,,,,


In [484]:
# count nulls in each column 
pd.DataFrame(train.isnull().sum())

Unnamed: 0,0
iid,0
id,0
gender,0
idg,0
condtn,0
...,...
attr5_3,320
sinc5_3,320
intel5_3,320
fun5_3,320


In [485]:
# get cols with more then 76 precent nulls
columns = train.columns
percent_missing = train.isnull().sum() * 100 / len(train)
missing_value_df = pd.DataFrame({'column_name': columns, 'percent_missing': percent_missing})

missing_drop = list(missing_value_df[missing_value_df.percent_missing>76].column_name)
missing_drop

['expnum',
 'numdat_3',
 'num_in_3',
 'attr7_3',
 'sinc7_3',
 'intel7_3',
 'fun7_3',
 'amb7_3',
 'shar7_3',
 'shar2_3',
 'attr5_3',
 'sinc5_3',
 'intel5_3',
 'fun5_3',
 'amb5_3']

In [486]:
#  fill missing values
cat_feature = list(set(train.columns) - set(train._get_numeric_data().columns))
enc = OrdinalEncoder()
enc.fit(train[cat_feature])
train[cat_feature]= enc.transform(train[cat_feature])
train.fillna(train.mode().iloc[0],inplace=True)


In [487]:
#  fill missing values
cat_feature = list(set(test.columns) - set(test._get_numeric_data().columns))
enc = OrdinalEncoder()
enc.fit(test[cat_feature])
test[cat_feature]= enc.transform(test[cat_feature])
test.fillna(train.mode().iloc[0],inplace=True)

In [488]:
# drop the columns with at least 76% null
train = train.drop(columns= missing_drop)
test = test.drop(columns= missing_drop)
train = train.drop(columns= ['iid', 'id'])

In [489]:
# check for duplicates
sum([1 for i in train.duplicated().tolist() if i])

0

In [490]:
# pid, museums
df_corr = pd.DataFrame(train.corr().unstack().sort_values(ascending=False))
df_corr

Unnamed: 0,Unnamed: 1,0
gender,gender,1.000000
attr5_1,attr5_1,1.000000
exphappy,exphappy,1.000000
yoga,yoga,1.000000
shopping,shopping,1.000000
...,...,...
met,wave,-0.546692
intel2_1,attr2_1,-0.572622
attr2_1,intel2_1,-0.572622
sinc2_1,attr2_1,-0.615473


In [491]:
# split to x train and y train
X_train = train.drop(columns= ['match'])
y_train = train['match']
X_test = test.drop(columns=['iid', 'id'])


In [492]:
# remove columns with high corr
X_train = X_train.drop(columns= ['pid','museums'])
X_test = X_test.drop(columns= ['pid','museums'])

In [493]:
# Remove outliers that 3 standard deviations away from the mean
for col in X_train.columns:  
  mean = X_train[col].mean()
  sd = X_train[col].std()
  outliers_index = X_train.index[(X_train[col] > mean+(3*sd))].tolist()
X_train = X_train.drop(outliers_index)
y_train = y_train.drop(outliers_index)


In [494]:
parameters = {
    'var_smoothing': [1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, 1e-13, 1e-14,
                      1e-15, 1e-16, 1e-16, 1e-17, 1e-18, 1e-19, 1e-20, 1e-21,
                      1e-22, 1e-23, 1e-24, 1e-25, 1e-26]}
# tune the Gaussian
algorithm = GridSearchCV(GaussianNB(), parameters, cv=5, verbose=0, n_jobs=-1)
y_pred = algorithm.fit(X_train, y_train).predict(X_test)

In [495]:
algorithm.score(X_train,y_train)

1.0

In [496]:
algorithm.best_estimator_

GaussianNB(var_smoothing=1e-20)

In [497]:
solution = pd.DataFrame()
solution['id'] = test['iid'].astype(str) + '&' + test['pid'].astype(int).astype(str)
solution['match'] = y_pred
solution.to_csv("initial_sub.csv",index=False)

In [498]:
files.download('initial_sub.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>