In [55]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [56]:
def make_simple_plot():
    fig, axes=plt.subplots(figsize=(12,5), nrows=1, ncols=2);
    axes[0].set_ylabel("$y$")
    axes[0].set_xlabel("$x$")
    axes[1].set_xlabel("$x$")
    axes[1].set_yticklabels([])
    axes[0].set_ylim([-2,2])
    axes[1].set_ylim([-2,2])
    plt.tight_layout();
    return axes
def make_plot():
    fig, axes=plt.subplots(figsize=(20,8), nrows=1, ncols=2);
    axes[0].set_ylabel("$p_R$")
    axes[0].set_xlabel("$x$")
    axes[1].set_xlabel("$x$")
    axes[1].set_yticklabels([])
    axes[0].set_ylim([0,1])
    axes[1].set_ylim([0,1])
    axes[0].set_xlim([0,1])
    axes[1].set_xlim([0,1])
    plt.tight_layout();
    return axes

In [57]:
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [58]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [59]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [60]:
print("shape of train data:"+str(train_data.shape))
print("test data shape:"+str(test_data.shape))

shape of train data:(891, 12)
test data shape:(418, 11)


In [61]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [62]:
train_data['Age'].fillna(np.mean(train_data['Age']),inplace=True)

In [63]:
train_data['Embarked'].fillna(train_data['Embarked'].mode()[0],inplace=True)

In [64]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [65]:
categorical_variables=train_data.dtypes[train_data.dtypes=='object'].index.values

In [66]:
train_data[categorical_variables].apply(lambda x: len(x.unique()))

Name        891
Sex           2
Ticket      681
Cabin       148
Embarked      3
dtype: int64

In [67]:
test_data[categorical_variables].apply(lambda x: len(x.unique()))

Name        418
Sex           2
Ticket      363
Cabin        77
Embarked      3
dtype: int64

In [68]:
train_data[train_data['Survived']==1].apply(lambda x: len(x.unique()))

PassengerId    342
Survived         1
Pclass           3
Name           342
Sex              2
Age             66
SibSp            5
Parch            5
Ticket         260
Fare           153
Cabin          102
Embarked         3
dtype: int64

In [69]:
train_data[train_data['Survived']==1].isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          206
Embarked         0
dtype: int64

In [70]:
categorical_variables=[x for x in train_data.columns if train_data[x].dtypes=='O' and x not in ['Name','Ticket','Cabin']]

In [71]:
categorical_variables

['Sex', 'Embarked']

In [72]:
from sklearn.preprocessing import LabelEncoder

In [73]:
from collections import defaultdict
d= defaultdict(LabelEncoder)

In [74]:
encoded_train_data=pd.DataFrame(data=train_data,copy=True)

In [75]:
encoded_test_data=pd.DataFrame(data=test_data,copy=True)

In [76]:
encoded_train_data[categorical_variables]=encoded_train_data[categorical_variables].apply(lambda x: d[x.name].fit_transform(x))

In [77]:
encoded_test_data[categorical_variables]=encoded_test_data[categorical_variables].apply(lambda x: d[x.name].transform(x))

In [78]:
encoded_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [79]:
independent_variables=[ x for x in train_data.columns if x not in ['Ticket','Cabin','Name','PassengerId','Survived']]

In [80]:
from sklearn.model_selection import train_test_split

In [81]:
X_train, X_test, y_train, y_test= train_test_split(encoded_train_data[independent_variables], encoded_train_data['Survived'], test_size=0.33, random_state= 0)

In [88]:
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [95]:
clsfr=LogisticRegression()
parameters = {"C": [0.0001,0.001,0.01,0.1,1,10,100]}
fitmodel=GridSearchCV(clsfr,param_grid=parameters, cv=5, scoring="accuracy")
fitmodel.fit(X_train,y_train)
fitmodel.best_estimator_,fitmodel.best_params_,fitmodel.best_score_,fitmodel.grid_scores_

(LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 {'C': 0.1},
 0.7986577181208053,
 [mean: 0.65604, std: 0.01685, params: {'C': 0.0001},
  mean: 0.67282, std: 0.00816, params: {'C': 0.001},
  mean: 0.70470, std: 0.01637, params: {'C': 0.01},
  mean: 0.79866, std: 0.01729, params: {'C': 0.1},
  mean: 0.79530, std: 0.02128, params: {'C': 1},
  mean: 0.79530, std: 0.01991, params: {'C': 10},
  mean: 0.79362, std: 0.02260, params: {'C': 100}])

In [96]:
clsfr=LogisticRegression(C=fitmodel.best_params_['C'])
clsfr.fit(X_train,y_train)
ypred=clsfr.predict(X_test)
accuracy_score(ypred, y_test)

0.7830508474576271

In [97]:
clsfr.fit(encoded_train_data[independent_variables],encoded_train_data['Survived'])


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [98]:
test_predictions=clsfr.predict(encoded_test_data[independent_variables])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').