In [1]:
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Read the training and test CSV files from the current working directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [3]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# List the column names available in the training data.
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
# Feature columns used for modelling, grouped by how they are preprocessed:
# numeric columns are median-imputed and scaled, categorical columns are
# mode-imputed and one-hot encoded.
columns_numeric = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
columns_cat = [
    'Sex',
    'Embarked',
]

In [6]:
# Count missing values in each selected feature column of the training set.
df_train[columns_numeric+columns_cat].isna().sum()

Pclass        0
Age         177
SibSp         0
Parch         0
Fare          0
Sex           0
Embarked      2
dtype: int64

In [7]:
# Count missing values in each selected feature column of the test set.
df_test[columns_numeric+columns_cat].isna().sum()

Pclass       0
Age         86
SibSp        0
Parch        0
Fare         1
Sex          0
Embarked     0
dtype: int64

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Imputation: learn the fill values from the training data only, then apply
# the same fitted imputers to both the training and the test frames.
imputer_numeric = SimpleImputer(strategy="median").fit(df_train[columns_numeric])  # per-column median
imputer_cat = SimpleImputer(strategy="most_frequent").fit(df_train[columns_cat])  # per-column mode

# Overwrite the selected columns in place with their imputed values.
for frame in (df_train, df_test):
    frame[columns_numeric] = imputer_numeric.transform(frame[columns_numeric])
    frame[columns_cat] = imputer_cat.transform(frame[columns_cat])

In [10]:
# Re-count missing values in the training feature columns after imputation.
df_train[columns_numeric+columns_cat].isna().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex         0
Embarked    0
dtype: int64

In [11]:
# Re-count missing values in the test feature columns after imputation.
df_test[columns_numeric+columns_cat].isna().sum()

Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
Sex         0
Embarked    0
dtype: int64

# Encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# Learn the category levels of each categorical column from the training data.
# handle_unknown='ignore' is set so that transform does not raise on category
# values that were not present when fitting.
encoder = OneHotEncoder(handle_unknown='ignore').fit(df_train[columns_cat])
encoder


OneHotEncoder(handle_unknown='ignore')

In [14]:
# encoder.categories_

In [15]:
# Show the one-hot output column names generated by the fitted encoder.
# NOTE(review): newer scikit-learn releases replace this method with
# get_feature_names_out(), which also yields different names; confirm the
# installed version before upgrading.
encoder.get_feature_names()

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [16]:
# One-hot encode the training categoricals; the result displays as a sparse matrix.
encoder.transform(df_train[columns_cat])

<891x5 sparse matrix of type '<class 'numpy.float64'>'
	with 1782 stored elements in Compressed Sparse Row format>

In [17]:
# Same transform, converted with .toarray() so the individual 0/1 values are visible.
encoder.transform(df_train[columns_cat]).toarray()

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [18]:
def _encode_categoricals(frame):
    """Return the one-hot encoding of ``frame``'s categorical columns as a DataFrame.

    Uses the already-fitted module-level ``encoder``; the columns are named
    after the encoder's generated feature names.
    """
    dense = encoder.transform(frame[columns_cat]).toarray()
    return pd.DataFrame(dense, columns=encoder.get_feature_names())


df_train_encoded = _encode_categoricals(df_train)
df_test_encoded = _encode_categoricals(df_test)


# Scaling of numeric data

In [19]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training numeric columns only, then rescale
# the numeric columns of both frames in place with that same fitted scaler.
scaler = MinMaxScaler().fit(df_train[columns_numeric])

for frame in (df_train, df_test):
    frame[columns_numeric] = scaler.transform(frame[columns_numeric])

In [20]:
# Final training matrix: scaled numeric columns followed by the one-hot columns.
df_train_final = pd.concat(
    objs=[df_train[columns_numeric], df_train_encoded],
    axis="columns",
)
df_train_final.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,x0_female,x0_male,x1_C,x1_Q,x1_S
0,1.0,0.271174,0.125,0.0,0.014151,0.0,1.0,0.0,0.0,1.0
1,0.0,0.472229,0.125,0.0,0.139136,1.0,0.0,1.0,0.0,0.0
2,1.0,0.321438,0.0,0.0,0.015469,1.0,0.0,0.0,0.0,1.0
3,0.0,0.434531,0.125,0.0,0.103644,1.0,0.0,0.0,0.0,1.0
4,1.0,0.434531,0.0,0.0,0.015713,0.0,1.0,0.0,0.0,1.0


In [21]:
# Final test matrix, assembled the same way as the training matrix:
# scaled numeric columns followed by the one-hot columns.
df_test_final = pd.concat(
    objs=[df_test[columns_numeric], df_test_encoded],
    axis="columns",
)
df_test_final.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,x0_female,x0_male,x1_C,x1_Q,x1_S
0,1.0,0.428248,0.0,0.0,0.015282,0.0,1.0,0.0,1.0,0.0
1,1.0,0.585323,0.125,0.0,0.013663,1.0,0.0,0.0,0.0,1.0
2,0.5,0.773813,0.0,0.0,0.018909,0.0,1.0,0.0,1.0,0.0
3,1.0,0.334004,0.0,0.0,0.016908,0.0,1.0,0.0,0.0,1.0
4,1.0,0.271174,0.125,0.166667,0.023984,1.0,0.0,0.0,0.0,1.0


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Candidate classifiers to compare, keyed by the label printed with each score.
models = dict(
    Logistic_Reg=LogisticRegression(),
    KNN=KNeighborsClassifier(n_neighbors=3),
    SVM=SVC(kernel='rbf'),
    RF=RandomForestClassifier(random_state=1),
)

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    df_train_final,
    df_train['Survived'],
    test_size=0.2,
    random_state=15,
)

In [25]:
# Fit every candidate on the training split and print its validation score.
for name, model in models.items():
    model.fit(X_train, y_train)
    val_score = model.score(X_val, y_val)
    print(name, val_score)

Logistic_Reg 0.7932960893854749
KNN 0.8324022346368715
SVM 0.8100558659217877
RF 0.7877094972067039


In [26]:
# Refit a 3-nearest-neighbours classifier on all labelled rows (training and
# validation together) for the final predictions.
model_final = KNeighborsClassifier(n_neighbors=3).fit(
    df_train_final,
    df_train['Survived'],
)
model_final

KNeighborsClassifier(n_neighbors=3)

In [27]:
# Predict labels for the preprocessed test features with the final model.
yp= model_final.predict(df_test_final)

In [28]:
# Display the predicted labels.
yp

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,

In [29]:
# Attach the predictions to the test frame and write the two-column
# submission file (no index column).
df_test['Survived'] = yp
submission = df_test[['PassengerId', 'Survived']]
submission.to_csv('Submission_final.csv', index=False)

In [30]:
def get_predictions(df):
    """Predict labels for a raw passenger frame using the fitted pipeline objects.

    Applies, in order, the module-level fitted ``imputer_numeric``,
    ``imputer_cat``, ``encoder`` and ``scaler`` to the columns listed in
    ``columns_numeric`` and ``columns_cat``, then calls
    ``model_final.predict`` on the assembled feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns in ``columns_numeric`` and
        ``columns_cat``. It is NOT modified; preprocessing happens on a copy,
        so calling this function repeatedly on the same frame gives the same
        result.

    Returns
    -------
    The value returned by ``model_final.predict`` for the rows of ``df``,
    in the same row order.
    """
    # Work on a copy of just the needed columns: writing imputed/scaled values
    # back into the caller's frame would make a second call re-scale data that
    # is already scaled.
    features = df[columns_numeric + columns_cat].copy()

    features[columns_numeric] = imputer_numeric.transform(features[columns_numeric])
    features[columns_cat] = imputer_cat.transform(features[columns_cat])

    # Reuse the input's index for both parts so the column-wise concat lines
    # rows up even when ``df`` does not carry a default 0..n-1 index.
    df_encoded = pd.DataFrame(
        encoder.transform(features[columns_cat]).toarray(),
        columns=encoder.get_feature_names(),
        index=features.index,
    )
    df_numeric = pd.DataFrame(
        scaler.transform(features[columns_numeric]),
        columns=columns_numeric,
        index=features.index,
    )

    # Column order matches the training matrix: numeric columns, then one-hot columns.
    df_final = pd.concat([df_numeric, df_encoded], axis=1)

    return model_final.predict(df_final)

In [31]:
# Try the helper on a freshly loaded copy of the test CSV; df_test itself was
# already imputed and scaled in place by the earlier cells.
new_df = pd.read_csv('./test.csv')
get_predictions(new_df)

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,

In [32]:
-*-

SyntaxError: invalid syntax (<ipython-input-32-1ec6fa7d950f>, line 1)