# **TPS-APRIL TITANIC DATASET COMPETITION**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
train_data=pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/train.csv')
test_data=pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')

# **OVERVIEW**

In [None]:
train_data.head()

In [None]:
train_data.columns

In [None]:
train_data.describe()

In [None]:
train_data.isnull().sum()

In [None]:
train_data.describe(include=['O'])

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# text columns at this point (e.g. Sex, Cabin, Embarked), and pandas >= 2.0
# raises on those instead of silently dropping them.
train_data.select_dtypes(include='number').corr()

In [None]:
sns.pairplot(data=train_data)

# **ANALYSIS OF COLUMNS**

## **AGE**

In [None]:
#The Age column is roughly normally distributed with a slight right skew, so we leave it unchanged for now
plt.figure(figsize=(15,6))
sns.histplot(train_data['Age'],kde=True)

In [None]:
plt.figure(figsize=(15,6))
sns.histplot(np.log(train_data['Age']),kde=True)

In [None]:
plt.figure(figsize=(15,6))
sns.histplot(data=train_data,x="Age",hue="Survived",binwidth=2,multiple='stack')

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(data=train_data,x="Age")
#As there are no outliers in the "Age" column, we can continue

**We fill the missing "Age" values with the mean. Alternatives would be the median, or a regression model that treats "Age" as the target value.**

In [None]:
# Impute missing ages with the column mean, then print the mean of the filled column.
age_mean = train_data['Age'].mean()
train_data["Age"] = train_data['Age'].fillna(age_mean)
print(train_data["Age"].mean())

## **FARE**

In [None]:
#Right Skewed data
plt.figure(figsize=(15,6))
sns.histplot(train_data["Fare"],kde=True,binwidth=2)

In [None]:
#Taking the log of the "Fare" column does not make it any more normally distributed, so we will continue without the log transform
plt.figure(figsize=(15,6))
sns.histplot(np.log(train_data['Fare']),kde=True)

In [None]:
plt.figure(figsize=(15,6))
sns.histplot(data=train_data,x="Fare",hue="Survived",binwidth=2,multiple='stack')

In [None]:
# Impute missing fares with the column mean (the median would be an alternative),
# then show the mean of the filled column.
fare_mean = train_data['Fare'].mean()
train_data["Fare"] = train_data['Fare'].fillna(fare_mean)
train_data['Fare'].mean()

In [None]:
train_data.isnull().sum()

In [None]:
train_data['Name'].value_counts()

In [None]:
train_data['Cabin'].value_counts()

**Name : This column has 92144 unique values and it doesn't contribute to the data, because we can't decide whether a person will survive based on his/her name. However, if titles were present (e.g. Mr, Mrs, Captain), we could consider them as a feature.**

**Ticket : This column holds a mix of numeric and alphanumeric values and does not contribute to the results.**

**Cabin : This column has more than 65K null values, which means we don't know where most of the people were on the ship.**

**PassengerId : This column is just a serial number from 1 to 100k, so it is of no use.**

In [None]:
# 1) Take the first letter of each cabin value and count survivors / non-survivors per letter.
# 2) This could serve as a feature, but filling more than 65k missing values with the mode would be unwise.
# 3) A regression-based imputation might help; it is not attempted here.
df = pd.DataFrame({
    'Cabin_pro': train_data['Cabin'].str[0],
    'Survive': train_data['Survived'],
})
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x="Cabin_pro", hue='Survive')

In [None]:
# Drop the columns that are not used as features, in a single call.
train_data = train_data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

In [None]:
train_data.head()

## **Pclass,Sex and Embarked**

In [None]:
train_data['Pclass'].value_counts()

In [None]:
# Passenger counts per class, split by survival: class 3 has the most non-survivors.
plt.figure(figsize=(15, 6))
sns.countplot(data=train_data, x='Pclass', hue='Survived')

In [None]:
# Mean survival rate per passenger class, highest first.
# as_index must be the boolean False: the string 'False' is truthy, so it did
# not disable the index and Pclass ended up as the index instead of a column.
(
    train_data[['Survived', 'Pclass']]
    .groupby('Pclass', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
train_data['Sex'].value_counts()

In [None]:
# Passenger counts per sex, split by survival.
plt.figure(figsize=(15, 6))
sns.countplot(data=train_data, x='Sex', hue='Survived')

In [None]:
train_data["Embarked"].value_counts()

In [None]:
# Mean survival rate per port of embarkation, highest first.
# as_index must be the boolean False: the string 'False' is truthy, so it did
# not disable the index and Embarked ended up as the index instead of a column.
(
    train_data[['Survived', 'Embarked']]
    .groupby('Embarked', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Passenger counts per port of embarkation, split by survival.
plt.figure(figsize=(15, 6))
sns.countplot(data=train_data, x='Embarked', hue='Survived')

**Embarked has only 250 null values, so we can simply fill them with the mode (most frequent value).**

In [None]:
# Fill missing ports with the most frequent one; Series.mode() skips NaN by default.
most_common_port = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data["Embarked"].fillna(most_common_port)


In [None]:
train_data.isnull().sum()

# **MANUAL ENCODING**

In [None]:
# Cut Age into 4 equal-width bands and show the mean survival rate per band.
train_data["AgeBand"] = pd.cut(train_data["Age"], 4)
# as_index must be the boolean False (the string 'False' is truthy and did not
# disable the index). observed=False is spelled out because AgeBand is
# categorical; it keeps empty bands in the result.
(
    train_data[['Survived', 'AgeBand']]
    .groupby('AgeBand', as_index=False, observed=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Replace Age with its ordinal band code (0..3), using the AgeBand edges.
# pd.cut uses right-closed intervals, which matches the previous `<=` checks.
# The outer edges are open-ended so that an age above 87 falls into band 3;
# previously such rows were selected but never assigned, leaving raw ages mixed
# in with the band codes.
# NOTE: Age is overwritten in place, so re-running this cell on already encoded
# values would put every row into band 0.
AGE_BIN_EDGES = [-np.inf, 21.81, 43.54, 65.27, np.inf]
train_data['Age'] = pd.cut(
    train_data['Age'], bins=AGE_BIN_EDGES, labels=False
).astype(float)  # keep the column's float dtype

train_data.head()
    

In [None]:
# Cut Fare into 3 equal-width bands and show the mean survival rate per band.
train_data["FareBand"] = pd.cut(train_data["Fare"], 3)
# as_index must be the boolean False (the string 'False' is truthy and did not
# disable the index). observed=False is spelled out because FareBand is
# categorical; it keeps empty bands in the result.
(
    train_data[['Survived', 'FareBand']]
    .groupby('FareBand', as_index=False, observed=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Replace Fare with its ordinal band code (0..2), using the FareBand edges.
# pd.cut uses right-closed intervals, which matches the previous `<=` checks.
# The outer edges are open-ended so that a fare above 744.66 falls into band 2;
# previously such rows were selected but never assigned, leaving raw fares
# mixed in with the band codes.
# NOTE: Fare is overwritten in place, so re-running this cell on already
# encoded values would put every row into band 0.
FARE_BIN_EDGES = [-np.inf, 248.673, 496.667, np.inf]
train_data['Fare'] = pd.cut(
    train_data['Fare'], bins=FARE_BIN_EDGES, labels=False
).astype(float)  # keep the column's float dtype

train_data.head()
    

In [None]:
#Dropping "AgeBand" and "FareBand" Column
train_data=train_data.drop(['AgeBand','FareBand'],axis=1)

In [None]:
#Mapping "Sex" column 
train_data['Sex']=train_data['Sex'].map({'male':0,'female':1}).astype(int)

In [None]:
#Mapping "Embarked" Column
train_data["Embarked"]=train_data['Embarked'].map({"S":0,"C":1,"Q":2}).astype(int)

In [None]:
train_data.head(10)

In [None]:
#Making new Column "FamilySize"="Parch"+"SibSp"+1
#(the +1 presumably counts the passenger themself - confirm), then showing how often each size occurs
train_data["FamilySize"]=train_data["Parch"]+train_data["SibSp"]+1
train_data["FamilySize"].value_counts()

In [None]:
plt.figure(figsize=(15,6))
sns.countplot(data=train_data,x='FamilySize',hue='Survived')

**Can we do something here with the FamilySize column?
Yes, we could encode it as 0/1 based on survival rate and family size, but we don't want to make the data too simple for the model, so we leave it unencoded.**

In [None]:
# Mean survival rate per family size, highest first.
# as_index must be the boolean False: the string 'False' is truthy, so it did
# not disable the index and FamilySize ended up as the index instead of a column.
(
    train_data[['Survived', 'FamilySize']]
    .groupby('FamilySize', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

**As we can see, there is no significant positive or negative correlation between any two columns, so we are not going to drop any column and will continue with these features.**

In [None]:
g=train_data.corr()
plt.figure(figsize=(15,6))
sns.heatmap(g,annot=True)

In [None]:
train_data=train_data.drop(['SibSp',"Parch"],axis=1)

In [None]:
train_data

# **TEST DATA**

### Same processing as for train_data; you can skip this section if you want

In [None]:
test_data=pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')
test_data.head(10)

In [None]:
test_data.isnull().sum()

In [None]:
# Drop the columns that are not used as features, in a single call.
test_data = test_data.drop(columns=['Ticket', 'Cabin', 'PassengerId', 'Name'])

In [None]:
#The Age column is roughly normally distributed with a slight right skew
plt.figure(figsize=(15,6))
sns.histplot(test_data["Age"],kde=True)

In [None]:
plt.figure(figsize=(15,6))

sns.histplot(np.log(test_data["Age"]),kde=True)

In [None]:
test_data["Age"]=test_data['Age'].fillna(test_data['Age'].mean())

In [None]:
plt.figure(figsize=(15,6))
sns.histplot(test_data["Fare"],kde=True)

In [None]:
plt.figure(figsize=(15,6))
sns.histplot(np.log(test_data["Fare"]),kde=True)

In [None]:
test_data["Fare"]=test_data['Fare'].fillna(test_data['Fare'].mean())

In [None]:
test_data['Embarked']=test_data["Embarked"].fillna(test_data['Embarked'].dropna().mode()[0])

In [None]:
# Replace Age with its ordinal band code (0..3), using the same edges as the
# training data. pd.cut uses right-closed intervals, matching the previous
# `<=` checks. The outer edges are open-ended so that an age above 87 falls
# into band 3; previously such rows were selected but never assigned, leaving
# raw ages mixed in with the band codes.
# NOTE: Age is overwritten in place, so re-running this cell on already encoded
# values would put every row into band 0.
AGE_BIN_EDGES = [-np.inf, 21.81, 43.54, 65.27, np.inf]
test_data['Age'] = pd.cut(
    test_data['Age'], bins=AGE_BIN_EDGES, labels=False
).astype(float)  # keep the column's float dtype

test_data.head(20)


In [None]:
# Replace Fare with its ordinal band code (0..2), using the same edges as the
# training data. pd.cut uses right-closed intervals, matching the previous
# `<=` checks. The outer edges are open-ended so that a fare above 744.66 falls
# into band 2; previously such rows were selected but never assigned, leaving
# raw fares mixed in with the band codes.
# NOTE: Fare is overwritten in place, so re-running this cell on already
# encoded values would put every row into band 0.
FARE_BIN_EDGES = [-np.inf, 248.673, 496.667, np.inf]
test_data['Fare'] = pd.cut(
    test_data['Fare'], bins=FARE_BIN_EDGES, labels=False
).astype(float)  # keep the column's float dtype

test_data.head(100)


In [None]:
test_data['Sex']=test_data['Sex'].map({'male':0,'female':1}).astype(int)

In [None]:
test_data["Embarked"]=test_data['Embarked'].map({"S":0,"C":1,"Q":2}).astype(int)

In [None]:
test_data.head()

In [None]:
test_data["FamilySize"]=test_data["Parch"]+test_data["SibSp"]+1

In [None]:
test_data=test_data.drop(['SibSp',"Parch"],axis=1)

# **MODEL AND PREDICTION**

### **RANDOM FOREST CLASSIFIER**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
import optuna

In [None]:
# Separate the target column from the feature columns.
y_train = train_data['Survived']
x_train = train_data.drop(columns=['Survived'])



In [None]:
def objective_rf(trial):
    """Optuna objective: mean 5-fold F1 score of a random forest on train_data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators`` (2..300) and ``max_depth``
        (2..50, sampled on a log scale).

    Returns
    -------
    float
        Mean of the F1 scores over the 5 cross-validation folds.
    """
    y_train = train_data['Survived']
    x_train = train_data.drop(['Survived'], axis=1)

    n_estimators = trial.suggest_int('n_estimators', 2, 300)
    # suggest_int(log=True) replaces int(suggest_loguniform(...)): the latter is
    # deprecated and stored a float in trial.params although an int was used.
    max_depth = trial.suggest_int('max_depth', 2, 50, log=True)
    # Fixed seed so that score differences between trials come from the
    # hyper-parameters and not from the forest's own randomness.
    clf = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, random_state=42
    )
    return cross_val_score(clf, x_train, y_train,
                           n_jobs=-1, cv=5, scoring='f1').mean()


In [None]:
study_rf= optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=40)


In [None]:
trial_rf= study_rf.best_trial
print(trial_rf.value)
print(trial_rf.params)

In [None]:
# Final random forest with hand-picked hyper-parameters. random_state is fixed
# so that re-running the notebook reproduces the same model and submission.
model_rf = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)


In [None]:
res_rf=model_rf.fit(x_train,y_train)

In [None]:
pred_rf=res_rf.predict(test_data)

In [None]:
# Re-read the raw test file to recover PassengerId, which was dropped from
# test_data during preprocessing. Uses the same Kaggle input path as the rest
# of the notebook; the previous '/content/test.csv' path does not exist there.
test_data1 = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')
res = pd.DataFrame({"PassengerId": test_data1['PassengerId'], "Survived": pred_rf})
res.to_csv("submission_ary_rf", index=False)

### **LightGBM Classifier**

In [None]:
import lightgbm as lgb

In [None]:
def objective_lgbm(trial):
    """Optuna objective: mean 5-fold F1 score of a LightGBM classifier on train_data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the LightGBM hyper-parameters listed in ``params``.

    Returns
    -------
    float
        Mean of the F1 scores over the 5 cross-validation folds.
    """
    y_train = train_data['Survived']
    x_train = train_data.drop(['Survived'], axis=1)

    # Same parameters, ranges and sampling order as before.
    # suggest_float(..., log=True) is the non-deprecated form of
    # suggest_loguniform, and suggest_int already returns an int.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 2, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 1, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.1, 1, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1, log=True),
    }
    # LightGBM only applies `subsample` when bagging is enabled through a
    # non-zero subsample_freq; without it the tuned value has no effect.
    clf = lgb.LGBMClassifier(subsample_freq=1, **params)
    return cross_val_score(clf, x_train, y_train,
                           n_jobs=-1, cv=5, scoring='f1').mean()


In [None]:
study_lgbm= optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=40)


In [None]:
trial_lgbm= study_lgbm.best_trial
print(trial_lgbm.value)
print(trial_lgbm.params)

In [None]:
# Final LightGBM classifier with hard-coded hyper-parameters.
lgbm_params = {
    'n_estimators': 244,
    'max_depth': 2,
    'learning_rate': 0.1811549889997839,
    'colsample_bytree': 0.8014731846804227,
    'num_leaves': 195,
    'reg_alpha': 0.6526035476086431,
    'reg_lambda': 0.17416065696048297,
    'min_split_gain': 0.437591342130419,
    'subsample': 0.4082158418003157,
}
model_lgbm = lgb.LGBMClassifier(**lgbm_params)

In [None]:
clf_lgbm=model_lgbm.fit(x_train,y_train)


In [None]:
pred=clf_lgbm.predict(test_data)

In [None]:
# Pair each test PassengerId with its LightGBM prediction and write the submission file.
res1 = test_data1[['PassengerId']].assign(Survived=pred)
res1.to_csv("submission_ary_LGBM", index=False)

## **Got 78.9% accuracy on the submission**