In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:


# Load the training set and derive the columns used as model features.
data = pd.read_csv("train.csv")

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that chained form only updates a temporary under pandas
# Copy-on-Write, which would leave the NaNs in `data`.
# NOTE(review): 0 is kept as the fill value so results are unchanged, but it is
# a sentinel rather than a plausible age -- consider median imputation.
data['Age'] = data['Age'].fillna(0)
# Missing ports become the integer 0, so get_dummies below emits an extra E_0
# column for them in addition to one column per port label.
data['Embarked'] = data['Embarked'].fillna(0)

# Single family-size feature: sum of the SibSp and Parch columns.
data['Family'] = data['SibSp'] + data['Parch']

# Integer-encode Sex. The codes follow the sorted labels found in THIS file, so
# any other file encoded the same way must contain the same set of labels.
data['Sex'] = pd.Categorical(data['Sex']).codes

# One indicator column per Embarked value, named E_<value>.
embarked_dummies = pd.get_dummies(data['Embarked'], prefix='E', drop_first=False)

# Drop the raw columns that are either unused or replaced by derived features,
# then attach the indicator columns.
data = data.drop(columns=['Ticket', 'Fare', 'Cabin', 'Embarked', 'Name', 'SibSp', 'Parch'])
data = pd.concat([data, embarked_dummies], axis=1)



In [3]:
# Feature matrix: the seven model inputs, in a fixed column order.
X_data = data.loc[:, ['Pclass', 'Sex', 'Age', 'Family', 'E_C', 'E_Q', 'E_S']]
# Target vector: the Survived column.
y_data = data.loc[:, 'Survived']

In [4]:
# Show the target followed by the feature matrix, one after the other.
print(y_data, X_data, sep='\n')

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
     Pclass  Sex   Age  Family  E_C  E_Q  E_S
0         3    1  22.0       1    0    0    1
1         1    0  38.0       1    1    0    0
2         3    0  26.0       0    0    0    1
3         1    0  35.0       1    0    0    1
4         3    1  35.0       0    0    0    1
..      ...  ...   ...     ...  ...  ...  ...
886       2    1  27.0       0    0    0    1
887       1    0  19.0       0    0    0    1
888       3    0   0.0       3    0    0    1
889       1    1  26.0       0    1    0    0
890       3    1  32.0       0    0    1    0

[891 rows x 7 columns]


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, random_state=0, test_size=0.2)

In [6]:
# Logistic-regression classifier with scikit-learn's default settings.
lr_model = LogisticRegression()
# fit() is left as the last expression so the cell echoes the fitted estimator.
lr_model.fit(X=X_train, y=y_train)

LogisticRegression()

In [7]:
# Predicted labels for the held-out validation rows.
y_pred = lr_model.predict(X=X_val)

In [8]:
# Fraction of validation rows whose predicted label matches the true label.
Accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(Accuracy)

0.8044692737430168


In [9]:
# Load the test set and apply the same feature preparation as the training set.
data_t = pd.read_csv("test.csv")

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that chained form only updates a temporary under pandas
# Copy-on-Write, which would leave the NaNs in `data_t`.
# NOTE(review): 0 is kept as the fill value so predictions are unchanged, but
# it is a sentinel rather than a plausible age -- consider median imputation.
data_t['Age'] = data_t['Age'].fillna(0)
# Any missing port becomes the integer 0 and would surface as an E_0 column.
data_t['Embarked'] = data_t['Embarked'].fillna(0)

# Single family-size feature: sum of the SibSp and Parch columns.
data_t['Family'] = data_t['SibSp'] + data_t['Parch']

# Integer-encode Sex. The codes follow the sorted labels found in THIS file, so
# they only line up with the training encoding if both files contain the same
# set of labels.
data_t['Sex'] = pd.Categorical(data_t['Sex']).codes

# One indicator column per Embarked value, named E_<value>.
embarked_dummies_t = pd.get_dummies(data_t['Embarked'], prefix='E', drop_first=False)

# Drop the raw columns that are either unused or replaced by derived features,
# then attach the indicator columns.
data_t = data_t.drop(columns=['Ticket', 'Fare', 'Cabin', 'Embarked', 'Name', 'SibSp', 'Parch'])
data_t = pd.concat([data_t, embarked_dummies_t], axis=1)

In [10]:
# Same seven feature columns, in the same order, as the training matrix.
test = data_t.loc[:, ['Pclass', 'Sex', 'Age', 'Family', 'E_C', 'E_Q', 'E_S']]
print(test)

     Pclass  Sex   Age  Family  E_C  E_Q  E_S
0         3    1  34.5       0    0    1    0
1         3    0  47.0       1    0    0    1
2         2    1  62.0       0    0    1    0
3         3    1  27.0       0    0    0    1
4         3    0  22.0       2    0    0    1
..      ...  ...   ...     ...  ...  ...  ...
413       3    1   0.0       0    0    0    1
414       1    0  39.0       0    1    0    0
415       3    1  38.5       0    0    0    1
416       3    1   0.0       0    0    0    1
417       3    1   0.0       2    1    0    0

[418 rows x 7 columns]


In [11]:
# Predict labels for the test rows, then print the predictions together with
# their dtype, container type and length as a quick sanity check.
y_pred_t = lr_model.predict(X=test)
print(y_pred_t, y_pred_t.dtype, type(y_pred_t), len(y_pred_t), sep='\n')

[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1
 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0
 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
int64
<class 'numpy.ndarray'>
418


In [15]:
# Take the submission ids from the test frame itself instead of hard-coding the
# range 892..1309, so ids stay aligned with the prediction rows even if the
# file is reordered or resized.
# NOTE(review): assumes test.csv has a PassengerId column (it is not among the
# columns dropped during preprocessing) -- confirm against the file.
a = data_t['PassengerId'].to_numpy()
print(len(a))

418


In [16]:
# Two-column array: passenger id, predicted label.
# column_stack keeps the integer dtype of its inputs (np.zeros would force
# float64 and write values such as 892.0 to the CSV) and sizes itself from the
# data instead of a hard-coded 418; it raises if the two lengths differ.
y_final = np.column_stack((a, y_pred_t))

In [17]:
# Submission table. The id column is named 'PassengerId' to match the header
# the Kaggle Titanic submission format specifies, and both columns are cast to
# integers so the CSV holds 892 / 0 rather than 892.0 / 0.0.
df = pd.DataFrame(y_final, columns=['PassengerId', 'Survived']).astype('int64')

In [18]:
# Write the submission file; index=False keeps the row index out of the CSV.
df.to_csv(path_or_buf='Submit2.csv', index=False)