In [213]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook
from sklearn.preprocessing import MinMaxScaler

In [214]:
# Read the train and test splits from CSV files under the local data directory.
df_train=pd.read_csv('data_house_price/train.csv')
df_test=pd.read_csv('data_house_price/test.csv')

In [215]:
# (rows, columns) of the training frame.
df_train.shape

(8693, 14)

In [216]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [217]:
# Missing-value count per training column, keeping only columns that have at least one gap.
df_train.isna().sum().loc[lambda x: x>0]

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
dtype: int64

In [218]:
# Missing-value count per test column, keeping only columns that have at least one gap.
df_test.isna().sum().loc[lambda x: x>0]

HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [219]:
def fill_na(df, value=0):
    """Replace every missing entry of ``df`` with ``value``, modifying ``df`` in place.

    Only columns that actually contain missing values are reassigned; all
    other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is mutated in place.
    value : scalar, default 0
        Replacement for missing entries. The default keeps the original
        behaviour of filling with 0. The same value is used for every column,
        so with the default a missing entry in a text column becomes the
        integer 0.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, so calls can be chained.
    """
    print ('filling NaN...')
    for col in list(df):
        column = df[col]
        if column.isna().any():
            df[col] = column.fillna(value)
    return df

# Fill the gaps in both splits; fill_na modifies each frame in place.
fill_na(df_train)
fill_na(df_test)
# Sanity check: each value is False once the corresponding frame has no NaN left.
df_train.isna().any().any(), df_test.isna().any().any()

filling NaN...
filling NaN...


(False, False)

In [220]:
# Show the training frame after the NaN fill.
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [221]:
def calc_additional_var(df):
    """Add the derived ``Deck``, ``Side`` and ``Spending`` columns to ``df``.

    ``Deck`` and ``Side`` are the first and last ``/``-separated pieces of the
    stringified ``Cabin`` value. ``Spending`` is the row-wise total of the five
    amenity columns. The frame is modified in place and is also returned.
    """
    # Split each cabin once and reuse the pieces for both derived columns.
    cabin_pieces = [str(cabin).split('/') for cabin in df['Cabin']]
    df['Deck'] = [pieces[0] for pieces in cabin_pieces]
    df['Side'] = [pieces[-1] for pieces in cabin_pieces]

    # Accumulate left to right so the summation order matches a chained `+`.
    amenity_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    running_total = df[amenity_cols[0]]
    for amenity in amenity_cols[1:]:
        running_total = running_total + df[amenity]
    df['Spending'] = running_total
    return df

In [222]:
# Add the Deck, Side and Spending columns to both splits.
# calc_additional_var also mutates the frame it is given, so the rebinding keeps the same object.
df_train = calc_additional_var(df_train)
df_test = calc_additional_var(df_test)

# # Encoding categorical variables

In [223]:
# Categorical columns that are turned into integer codes before modelling.
columns_to_encode = [
    'Destination',
    'HomePlanet',
    'CryoSleep',
    'VIP',
    'Deck',
    'Side',
]
# Columns dropped from both splits: identifiers, the raw cabin string, and
# the individual amenity amounts that are summed into Spending.
columns_to_remove = [
    'PassengerId',
    'Cabin',
    'Name',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [224]:
# Show the training frame with the derived Deck, Side and Spending columns.
df_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Side,Spending
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,P,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,S,736.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,S,10383.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,S,5176.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,S,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,A,P,8536.0
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,G,S,0.0
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,G,S,1873.0
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,E,S,4637.0


In [225]:
# Keep the test passenger IDs before PassengerId is dropped from the test frame.
test_passenger_id = df_test['PassengerId']

# Remove the unused columns from both splits in place.
df_train.drop(columns=columns_to_remove, inplace=True)
df_test.drop(columns=columns_to_remove, inplace=True)
df_train

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,Deck,Side,Spending
0,Europa,False,TRAPPIST-1e,39.0,False,False,B,P,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,True,F,S,736.0
2,Europa,False,TRAPPIST-1e,58.0,True,False,A,S,10383.0
3,Europa,False,TRAPPIST-1e,33.0,False,False,A,S,5176.0
4,Earth,False,TRAPPIST-1e,16.0,False,True,F,S,1091.0
...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,False,A,P,8536.0
8689,Earth,True,PSO J318.5-22,18.0,False,False,G,S,0.0
8690,Earth,False,TRAPPIST-1e,26.0,False,True,G,S,1873.0
8691,Europa,False,55 Cancri e,32.0,False,False,E,S,4637.0


In [226]:
# Correlate only the numeric/boolean columns. Relying on the default for
# `numeric_only` is deprecated, and pandas >= 2.0 raises on the remaining
# string columns instead of silently skipping them.
df_train.corr(numeric_only=True)

  df_train.corr()


Unnamed: 0,Age,Transported,Spending
Age,1.0,-0.07133,0.179475
Transported,-0.07133,1.0,-0.199514
Spending,0.179475,-0.199514,1.0


In [227]:
# Separate the target column from the predictors.
y_train = df_train['Transported']
X_train = df_train.drop(columns='Transported')
X_test = df_test  # same object as df_test, not a copy

In [228]:
from sklearn.preprocessing import LabelEncoder

In [229]:
def label_encoder(X_train, X_test, columns=None):
    """Integer-encode categorical columns using labels learned from the training frame.

    Each column is cast to ``str`` and a ``LabelEncoder`` is fitted on the
    training values only. Test values are mapped through the fitted classes;
    a test label that never occurred in training is encoded as ``-1`` instead
    of raising ``ValueError``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test predictors. Neither input is modified; copies are
        encoded and returned.
    columns : iterable of str, optional
        Columns to encode. Defaults to the notebook-level
        ``columns_to_encode`` list.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of ``X_train`` and ``X_test``.
    """
    print ('label encoding.')
    if columns is None:
        columns = columns_to_encode
    X_train = X_train.copy() # to avoid warning related to setting the copy...
    X_test = X_test.copy() # to avoid warning related to setting the copy...

    for col in columns:
        # Convert to str first: the encoder may fail when a column mixes data types.
        train_labels = X_train[col].astype(str)
        test_labels = X_test[col].astype(str)

        le = LabelEncoder().fit(train_labels)
        X_train[col] = le.transform(train_labels)

        # classes_ is ordered by code, so enumerate() reproduces le.transform
        # for known labels. Labels missing from the dict become NaN, then -1.
        code_by_label = {label: code for code, label in enumerate(le.classes_)}
        X_test[col] = (
            test_labels.map(code_by_label)
            .fillna(-1)
            .astype(X_train[col].dtype)
        )
    return X_train,X_test

# Encode the categorical columns of both splits, then show the resulting column dtypes.
X_train,X_test= label_encoder(X_train,X_test)
X_train.dtypes

# # apply in case the test set contains values not seen in the training set
# le = LabelEncoder().fit(df_train_copy['foo']) 
# df_train_copy['foo'] = le.transform(df_train_copy['foo'])
# le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
# # print (le_surname_dict)
# df_test_copy['foo']= df_test_copy['foo'].apply(lambda x: le_dict.get(x, -1)) # returns -1 if not found 



label encoding.


HomePlanet       int32
CryoSleep        int32
Destination      int32
Age            float64
VIP              int32
Deck             int32
Side             int32
Spending       float64
dtype: object

In [230]:
def normalize(X_train, X_test):
    """Min-max scale both splits using ranges learned from ``X_train`` only.

    Returns the scaled training data and the scaled test data, in that order,
    as produced by ``MinMaxScaler.transform``.
    """
    print ('normalizing.')
    # Fit on the training data alone, then apply the same scaling to both splits.
    fitted_scaler = MinMaxScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

# Scale both splits, then show the container type returned by the scaler.
X_train_scaled, X_test_scaled  = normalize(X_train, X_test)
type(X_train_scaled)

normalizing.


numpy.ndarray

In [231]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for validation. A fixed random_state makes
# the split, and every score computed from it, reproducible across kernel restarts.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.3, random_state=42
)

In [232]:
# (rows, features) of the held-out validation split.
X_val.shape

(2608, 8)

In [233]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from xgboost import XGBClassifier


In [None]:
# Grid-search an RBF-kernel SVC over C and gamma, selecting by cross-validated accuracy.
clf = SVC(kernel='rbf')
grid_values = {'C': [0.005, 0.01, 0.02, 0.03, 0.1, 1, 100, 10000], 'gamma':[0.001, 0.01, 0.1,1]}
grid_clf = GridSearchCV(clf, param_grid=grid_values, scoring='accuracy')
grid_clf.fit(X_tr, y_tr)

# The labels name the metric actually optimised (accuracy, not f1).
print('Grid best parameter (max accuracy): ', grid_clf.best_params_)
print('Grid best score (mean CV accuracy): ', grid_clf.best_score_)
print("train accuracy= {:.3%}".format(grid_clf.score(X_tr, y_tr)))

# X_val / y_val is the hold-out carved from the training data, not the test set.
print("validation accuracy= {:.3%}".format(grid_clf.score(X_val, y_val)))