In [91]:
import pandas as pd
import numpy as np
import os

In [92]:
# Processed CSVs live in <parent dir>/data/processed, relative to the
# notebook's working directory.
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ("train.csv", "test.csv")
)

In [93]:
# Load both processed splits, using PassengerId as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in (train_file_path, test_file_path)
)

In [94]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [95]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officer         418 n

# Data Preparation

In [96]:
# Feature matrix: every column from "Age" onward (all columns except the
# "Survived" label), cast to float. `.values` replaces DataFrame.as_matrix(),
# which was deprecated in pandas 0.23 and removed in pandas 1.0.
X = train_df.loc[:, "Age":].values.astype("float")
# Label vector: `.values` on a single column is already a flat 1-D array,
# so the (also deprecated) Series.ravel() call is not needed.
y = train_df["Survived"].values

In [97]:
print (X.shape, y.shape)

(891, 32) (891,)


In [98]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.2, random_state = 0)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [99]:
print ("Mean survival in train : {0:.3f}".format(np.mean(y_train)))
print ("Mean survival in test : {0:.3f}".format(np.mean(y_test)))

Mean survival in train : 0.383
Mean survival in test : 0.385


In [100]:
import sklearn
sklearn.__version__

'0.19.1'

In [101]:
from sklearn.dummy import DummyClassifier

In [102]:
model_dummy= DummyClassifier(strategy="most_frequent", random_state =0)

In [103]:
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [104]:
#Score means the accuracy
print ("Score of baseline model: {0:.2f}".format(model_dummy.score(X_test, y_test)));

Score of baseline model: 0.61


In [105]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [106]:
print ("Accuracy score is: {0:.2f}".format(accuracy_score(y_test, model_dummy.predict(X_test))))

Accuracy score is: 0.61


In [107]:
print ("Confusion Matrix for baseline model: \n{0}".format(confusion_matrix(y_test, model_dummy.predict(X_test))))

Confusion Matrix for baseline model: 
[[110   0]
 [ 69   0]]


In [108]:
print ("Precision score is: {0:.2f}".format(precision_score(y_test, model_dummy.predict(X_test))))
print ("Recall score is: {0:.2f}".format(recall_score(y_test, model_dummy.predict(X_test))))

Precision score is: 0.00
Recall score is: 0.00


  'precision', 'predicted', average, warn_for)


# Kaggle Submission File

In [109]:
# `.values` replaces DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
test_X = test_df.values.astype("float")

In [110]:
predicitons = model_dummy.predict(test_X)

In [111]:
df_submission = pd.DataFrame({"PassengerId": test_df.index, "Survived":predicitons})
df_submission.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,0
415,1307,0
416,1308,0
417,1309,0


In [112]:
submission_data_path = os.path.join(os.path.pardir, "data", "external")
submission_file_path = os.path.join(submission_data_path, "01_dummy.csv")
df_submission.to_csv(submission_file_path, index=False)

In [113]:
def GetSubmissionFile(model, filename, data=None, output_dir=None):
    """Predict with `model` and write a PassengerId/Survived CSV.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(X)``.
    filename : str
        Name of the CSV file to create inside the output directory.
    data : pandas.DataFrame, optional
        Feature frame to predict on; its index supplies the PassengerId
        column. Defaults to the notebook-level ``test_df``.
    output_dir : str, optional
        Directory the file is written to. Defaults to
        ``<parent dir>/data/external``. The directory must already exist.

    Returns
    -------
    None
        The result is the file written to disk plus a confirmation message.
    """
    features = test_df if data is None else data
    # `.values` replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    test_X = features.values.astype("float")
    predictions = model.predict(test_X)
    df_submission = pd.DataFrame({"PassengerId": features.index, "Survived": predictions})
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, "data", "external")
    submission_file_path = os.path.join(output_dir, filename)
    # index=False: the submission carries PassengerId as a regular column.
    df_submission.to_csv(submission_file_path, index=False)
    print ("New File {0} created".format(filename))

# Logistic Regression Model

In [114]:
from sklearn.linear_model import LogisticRegression

In [115]:
model_lr_1 = LogisticRegression(random_state=0)

In [116]:
model_lr_1.fit(X_train, y_train);

In [184]:
print (model_lr_1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [117]:
print ("Score is: {0:.3f}".format(model_lr_1.score(X_test, y_test)))

Score is: 0.832


In [118]:
GetSubmissionFile(model_lr_1, "02_lr.csv")

New File 02_lr.csv created


# Hyperparameter Optimization

In [129]:
model_lr = LogisticRegression(random_state=0)

In [130]:
from sklearn.model_selection import GridSearchCV

In [171]:
# Grid over regularisation strength C and penalty type. The penalty names are
# the strings 'l1' / 'l2' (lower-case letter L); '11' / '12' (digits) are not
# valid LogisticRegression penalties and make the search fail on fit.
parameters = {'C':[1.0,10.0,50.0,100.0,1000.0], 'penalty': ['l1','l2']}
# Bound to `clf`, the name the fit call in the following cell refers to
# (it was previously bound to `cif`, leaving `clf` undefined).
clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)

In [174]:
#Error while running
#clf.fit(X_train, y_train);

# Feature Normalization and Standardization

In [148]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [151]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [153]:
X_train_scaled[: ,0].min(), X_train_scaled[:,0].max()

(0.0, 1.0)

In [154]:
X_test_scaled= scaler.transform(X_test)

# Feature Standardization

In [155]:
scaler = StandardScaler()
# Learn the per-feature mean and standard deviation from the training split only.
X_train_scaled = scaler.fit_transform(X_train)
# Apply the training statistics to the held-out split. Calling fit_transform
# here would refit the scaler on the test data, so the two splits would be
# scaled differently and the `scaler` object would end up holding test-set
# statistics instead of training ones.
X_test_scaled = scaler.transform(X_test)

## Create model after standardization

In [160]:
model_lr = LogisticRegression()
# Penalty names are the strings 'l1' / 'l2' (lower-case letter L); the
# digit strings '11' / '12' are not valid LogisticRegression penalties.
parameters = {"C":[1.0,10.0,50.0,100.0,1000.0], 'penalty': ['l1','l2']}
# Named `clf` (was `cif`) so it matches the name used by the notebook's fit call.
clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)

## Model Persistence

In [203]:
import pickle

In [204]:
# Pickle destinations for the model and the scaler, both under <parent dir>/models.
model_file_path, scaler_file_path = (
    os.path.join(os.path.pardir, "models", pickle_name)
    for pickle_name in ("lr_model.pkl", "lr_scaler.pkl")
)

In [205]:
# Persist the fitted model and the scaler. The `with` blocks close each file
# even if pickle.dump raises, replacing the separate open / dump / close cells.
with open(model_file_path, "wb") as model_file_pickle:
    pickle.dump(model_lr_1, model_file_pickle)
with open(scaler_file_path, "wb") as scaler_file_pickle:
    pickle.dump(scaler, scaler_file_pickle)

### Load Persisted Files

In [208]:
# NOTE: pickle.load can execute arbitrary code; only load files this
# notebook (or another trusted source) wrote.
with open(model_file_path, "rb") as model_file_pickle:
    lr_loaded = pickle.load(model_file_pickle)
# The scaler file was previously opened and closed without being read;
# load it so the persisted scaler is actually available.
with open(scaler_file_path, "rb") as scaler_file_pickle:
    scaler_loaded = pickle.load(scaler_file_pickle)

In [212]:
print ("score of basic linear regression model is: {0:.3f}".format(lr_loaded.score(X_test, y_test)))

score of basic linear regression model is: 0.832


In [213]:
#GetSubmissionFile(lr_loaded, "lr_loaded.csv")

New File lr_loaded.csv created
