# IMPORTING PACKAGES.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# DATA ANALYSIS.

In [None]:
# READ THE TRAINING COMMA-SEPARATED VALUES (CSV) FILE INTO A PANDAS DATA FRAME AND PREVIEW IT.
train = pd.read_csv(filepath_or_buffer=r"../input/titanic/train.csv")
train.head(n=10)

In [None]:
# READ THE TEST CSV FILE INTO A PANDAS DATA FRAME AND PREVIEW IT.
test = pd.read_csv(filepath_or_buffer=r"../input/titanic/test.csv")
test.head(n=10)

In [None]:
# PRINT (ROWS, COLUMNS) FOR THE TRAIN DATASET, THEN FOR THE TEST DATASET.
for frame in (train, test):
    print(frame.shape)


In [None]:
# PRINT THE COLUMN LABELS OF THE TRAIN DATASET, THEN OF THE TEST DATASET.
for frame in (train, test):
    print(frame.columns)

In [None]:
# COUNT THE MISSING VALUES IN EACH COLUMN OF THE TRAIN DATASET.
train.isna().sum()

In [None]:
# PRINT HOW OFTEN EACH VALUE OCCURS IN SURVIVED, SEX, EMBARKED, SIBSP AND TICKET.
for column_name in ("Survived", "Sex", "Embarked", "SibSp", "Ticket"):
    print(train[column_name].value_counts())

In [None]:
# COUNT THE MISSING VALUES IN EACH COLUMN OF THE TEST DATASET.
test.isna().sum()

In [None]:
# BAR CHART OF HOW MANY PASSENGERS FALL IN EACH SURVIVED CLASS.
sns.countplot(data=train, x="Survived")

In [None]:
# BAR CHART OF HOW MANY PASSENGERS THERE ARE OF EACH SEX.
sns.countplot(data=train, x="Sex")

In [None]:
# SURVIVAL RATE BY SEX: THE MEAN OF THE SURVIVED COLUMN WITHIN EACH SEX GROUP.
train[['Sex', 'Survived']].groupby('Sex').mean()

In [None]:
# SURVIVAL RATE BY SEX (ROWS) AND PCLASS (COLUMNS).
train.pivot_table(values='Survived', index='Sex', columns='Pclass')

In [None]:
# PLOT THE SEX-BY-PCLASS SURVIVAL RATE TABLE, ONE LINE PER PCLASS.
train.pivot_table(values='Survived', index='Sex', columns='Pclass').plot()

In [None]:
# BAR PLOT OF THE SURVIVAL RATE WITHIN EACH PCLASS.
sns.barplot(data=train, x='Pclass', y='Survived')

In [None]:
#LOOK AT SURVIVAL RATE BY SEX, AGE GROUP AND PCLASS.
#BIN AGE INTO (0, 18] AND (18, 80] AND GROUP ON THE BINS; GROUPING ON THE RAW
#'Age' COLUMN WOULD GIVE ONE ROW PER DISTINCT AGE AND LEAVE THE BINS UNUSED.
age=pd.cut(train['Age'],[0,18,80])
train.pivot_table('Survived',['Sex',age],'Pclass')

In [None]:
# COUNT THE EMPTY VALUES IN EACH COLUMN (SUMMING DOWN THE ROWS).
train.isna().sum(axis=0)

In [None]:
# PRINT THE VALUE COUNTS OF EVERY COLUMN, EACH FOLLOWED BY A BLANK LINE.
for column_name in train.columns:
    print(train[column_name].value_counts(), end="\n\n")

In [None]:
# SHOW THE DATA TYPE OF EACH COLUMN IN THE TRAIN DATASET.
train.dtypes

In [None]:
# PRINT THE DISTINCT VALUES OF THE SEX AND EMBARKED COLUMNS.
for column_name in ('Sex', 'Embarked'):
    print(train[column_name].unique())

# CLEANING THE DATASET

In [None]:
# COLLECT THE DISTINCT TITLES IN THE TRAIN NAMES: THE TEXT BETWEEN THE FIRST COMMA
# AND THE FOLLOWING PERIOD, WITH SURROUNDING WHITESPACE REMOVED.
title = {
    full_name.split(",")[1].split(".")[0].strip()
    for full_name in train["Name"]
}
print(title)

In [None]:
# MAP EACH RAW TITLE TO ONE OF FOUR GROUPS: "officer", "royal", "normald" OR "norm".
titles = {
    'Sir': "officer",
    'Mme': "normald",
    'Mr': "norm",
    'Master': "normald",
    'Don': "officer",
    'Miss': "normald",
    'Lady': "normald",
    'Mlle': "normald",
    'Col': "officer",
    'Ms': "normald",
    'the Countess': "royal",
    'Mrs': "normald",
    'Major': "officer",
    'Capt': "officer",
    'Dr': "officer",
    'Rev': "officer",
    'Jonkheer': "royal",
}


In [None]:
# REPLACE EACH FULL NAME WITH ITS TITLE, THEN WITH THE TITLE'S GROUP FROM `titles`.
# THIS OVERWRITES train["Name"], SO THE CELL CANNOT BE RE-RUN WITHOUT RELOADING `train`.
train["Name"] = (
    train["Name"]
    .map(lambda full_name: full_name.split(",")[1].split(".")[0].strip())
    .map(titles)
)
train.head(10)

In [None]:
# DROP THE COLUMNS THAT ARE NOT USED AS FEATURES.
df = train.drop(columns=["Ticket", "Cabin", "PassengerId"])
df.head()

In [None]:
#CONVERT THE STRING (CATEGORICAL) VALUES TO INTEGER CODES.
#EVERY CODE IS A REAL INTEGER: A QUOTED "1" WOULD LEAVE THE COLUMN AS A MIX OF
#INTEGERS AND STRINGS INSTEAD OF A NUMERIC COLUMN.
df.Sex=df.Sex.map({"female":0,"male":1})
df.Embarked=df.Embarked.map({"S":0,"C":1,"Q":2})
df.Name=df.Name.map({"royal":0,"normald":1,"officer":2,"norm":3})
df.head()

In [None]:
# COUNT THE MISSING VALUES PER COLUMN BEFORE THEY ARE REPLACED.
df.isna().sum()

In [None]:
#REPLACE THE NULL VALUES COLUMN BY COLUMN.
#CONTINUOUS COLUMNS GET THEIR OWN MEAN; EVERY OTHER COLUMN HOLDS CATEGORY CODES
#AND GETS ITS MOST FREQUENT VALUE. FILLING THE WHOLE FRAME WITH THE MEAN AGE
#WOULD WRITE AN AGE INTO COLUMNS SUCH AS Embarked.
continuous_columns=("Age","Fare")
for column_name in df.columns:
    if column_name in continuous_columns:
        fill_value=df[column_name].mean()
    else:
        fill_value=df[column_name].mode().iloc[0]
    df[column_name]=df[column_name].fillna(fill_value)
df.head(10)

In [None]:
# CONFIRM THAT NO MISSING VALUES REMAIN.
df.isna().sum()

In [None]:
#GET SUMMARY STATISTICS OF THE CLEANED TRAIN FRAME.
df.describe()

# FEATURE SCALING.

In [None]:
#STANDARDISE AGE AND FARE SO EACH IS CENTERED AROUND 0 WITH A STANDARD DEVIATION OF 1:
#SUBTRACT THE COLUMN MEAN FIRST, THEN DIVIDE BY THE COLUMN STANDARD DEVIATION.
#THE PARENTHESES MATTER: WITHOUT THEM ONLY THE MEAN IS DIVIDED, GIVING x - (mean/std).
#NOTE(review): THE RESULT IS ROUNDED TO WHOLE NUMBERS, WHICH IS COARSE; AND ANY FRAME
#PASSED TO THE MODEL LATER MUST BE SCALED WITH THE SAME FORMULA - VERIFY.

from sklearn.preprocessing import StandardScaler
df["Age"]=round((df.Age-df.Age.mean())/df.Age.std())
df["Fare"]=round((df.Fare-df.Fare.mean())/df.Fare.std())


In [None]:
# PREVIEW THE FIRST FIVE ROWS AFTER SCALING.
df.head(n=5)

In [None]:
# SAME PREVIEW AGAIN (FIRST FIVE ROWS OF THE SCALED FRAME).
df.head(n=5)

# DATA MODELLING

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# HOLD OUT 20% OF THE ROWS, KEEPING THE SURVIVED CLASS PROPORTIONS IN BOTH PARTS.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=["Survived"]),
    df["Survived"],
    test_size=0.2,
    random_state=0,
    stratify=df["Survived"],
)

# CREATE A FUNCTION WITH LOGISTIC REGRESSION

In [None]:
def models(x_train,y_train):
    """Fit a logistic regression classifier and print its training accuracy.

    Parameters
    ----------
    x_train : features handed to ``LogisticRegression.fit``.
    y_train : target labels matching the rows of ``x_train``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train, y_train)

    # Accuracy is measured on the same data the model was fitted on.
    training_accuracy = classifier.score(x_train, y_train)
    print('Logistic Regression training accuracy:', training_accuracy)

    return classifier

In [None]:
# TRAIN THE LOGISTIC REGRESSION MODEL ON THE TRAINING SPLIT.
model = models(x_train=x_train, y_train=y_train)

# PREPARE THE TEST DATA AND PRINT THE PREDICTION OF THE LOGISTIC REGRESSION MODEL

In [None]:
# COLLECT THE DISTINCT TITLES IN THE TEST NAMES: THE TEXT BETWEEN THE FIRST COMMA
# AND THE FOLLOWING PERIOD, WITH SURROUNDING WHITESPACE REMOVED.
title = {
    full_name.split(",")[1].split(".")[0].strip()
    for full_name in test["Name"]
}
print(title)

In [None]:


# MAP EACH RAW TITLE TO ONE OF FOUR GROUPS. EACH KEY IS LISTED ONCE; A REPEATED KEY
# IN A DICT LITERAL IS SILENTLY OVERWRITTEN BY ITS LAST OCCURRENCE.
titles = {
    'Sir': "officer",
    'Mme': "normald",
    'Mr': "norm",
    'Master': "normald",
    'Don': "officer",
    'Miss': "normald",
    'Lady': "normald",
    'Mlle': "normald",
    'Col': "officer",
    'Ms': "normald",
    'the Countess': "royal",
    'Mrs': "normald",
    'Major': "officer",
    'Capt': "officer",
    'Dr': "officer",
    'Rev': "officer",
    'Jonkheer': "royal",
    # NOTE(review): 'Dona' PRESUMABLY OCCURS ONLY IN THE TEST NAMES; A TITLE MISSING
    # FROM THIS DICT MAPS TO NaN. CONFIRM AGAINST THE PRINTED TITLE SET AND THE GROUP.
    'Dona': "royal",
}

In [None]:
# REPLACE EACH FULL NAME WITH ITS TITLE, THEN WITH THE TITLE'S GROUP FROM `titles`.
# THIS OVERWRITES test["Name"], SO THE CELL CANNOT BE RE-RUN WITHOUT RELOADING `test`.
test["Name"] = (
    test["Name"]
    .map(lambda full_name: full_name.split(",")[1].split(".")[0].strip())
    .map(titles)
)
test.head(10)

In [None]:
# DROP THE COLUMNS THAT ARE NOT USED AS FEATURES.
df1 = test.drop(columns=["Ticket", "Cabin", "PassengerId"])
df1.head()

In [None]:
#CONVERT THE STRING (CATEGORICAL) VALUES TO INTEGER CODES.
#EVERY CODE IS A REAL INTEGER: A QUOTED "1" WOULD LEAVE THE COLUMN AS A MIX OF
#INTEGERS AND STRINGS INSTEAD OF A NUMERIC COLUMN.
df1.Sex=df1.Sex.map({"female":0,"male":1})
df1.Embarked=df1.Embarked.map({"S":0,"C":1,"Q":2})
df1.Name=df1.Name.map({"royal":0,"normald":1,"officer":2,"norm":3})
df1.head()

In [None]:
# COUNT THE MISSING VALUES PER COLUMN BEFORE THEY ARE REPLACED.
df1.isna().sum()

In [None]:
#REPLACE THE NULL VALUES COLUMN BY COLUMN.
#CONTINUOUS COLUMNS GET THEIR OWN MEAN; EVERY OTHER COLUMN HOLDS CATEGORY CODES
#AND GETS ITS MOST FREQUENT VALUE. FILLING THE WHOLE FRAME WITH THE MEAN AGE
#WOULD WRITE AN AGE INTO Fare AND INTO ANY CATEGORY CODE THAT IS MISSING.
continuous_columns=("Age","Fare")
for column_name in df1.columns:
    if column_name in continuous_columns:
        fill_value=df1[column_name].mean()
    else:
        fill_value=df1[column_name].mode().iloc[0]
    df1[column_name]=df1[column_name].fillna(fill_value)

In [None]:
# CONFIRM THAT NO MISSING VALUES REMAIN.
df1.isna().sum()

In [None]:
#STANDARDISE AGE AND FARE: SUBTRACT THE COLUMN MEAN FIRST, THEN DIVIDE BY THE COLUMN
#STANDARD DEVIATION. THE PARENTHESES MATTER: WITHOUT THEM AGE BECOMES x - (mean/std).
#NOTE(review): THESE ARE THE TEST FRAME'S OWN STATISTICS; THE FRAME USED TO FIT THE
#MODEL MUST BE SCALED WITH THE SAME FORMULA - VERIFY.
df1["Age"]=round((df1.Age-df1.Age.mean())/df1.Age.std())
df1["Fare"]=round((df1.Fare-df1.Fare.mean())/df1.Fare.std())


In [None]:
# PREVIEW THE FIRST FIVE ROWS OF THE PREPARED TEST FRAME.
df1.head(n=5)

In [None]:
# PREDICT SURVIVAL FOR EVERY ROW OF THE PREPARED TEST FRAME AND SHOW THE RESULT.
pred = model.predict(X=df1)
pred