### Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Render matplotlib figures directly in the notebook output.
%matplotlib inline 
# Default figure size (width, height) in inches for every plot below.
plt.rcParams['figure.figsize']=(8,5)

### Analyzing the Dataset

In [None]:
# Read the competition's training split and preview its first rows.
TRAIN_CSV = '../input/tabular-playground-series-apr-2021/train.csv'
df = pd.read_csv(filepath_or_buffer=TRAIN_CSV)
df.head()

In [None]:
# Dataset dimensions as (rows, columns).
print(df.shape)

Our target variable is Survived

In [None]:
# Summary of the frame's columns, used below to decide which ones need converting.
df.info()

Need to convert datatypes accordingly

In [None]:
# Visualise where values are missing: the heatmap is drawn from the boolean
# null-mask of the frame (one cell per row/column pair), without a colour bar.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cbar=False)
plt.show()

Dropping the insignificant variables

In [None]:
# Remove the columns this analysis does not use, then preview what is left.
unused_columns = ["Cabin", "Name", "PassengerId", "Ticket"]
df.drop(columns=unused_columns, inplace=True)
df.head()

In [None]:
# Cast the two categorical columns to strings.
# NOTE(review): with classic object-dtype pandas this also turns missing values
# into the literal string 'nan', which the Embarked plot labels as "Unknown" —
# confirm for the pandas version in use.
for text_col in ("Sex", "Embarked"):
    df[text_col] = df[text_col].astype(str)

### Visualization

In [None]:
# Survival counts split by sex.
# `x` must be passed by keyword: seaborn >= 0.12 takes `data` as the first
# parameter and makes the rest keyword-only, so a positional "Sex" raises
# TypeError there (seaborn 0.11 only warned).
sns.countplot(x="Sex", data=df, hue="Survived")
plt.show()

0 - Not Survived , 1 - Survived

From the plot above, we can see that more female passengers survived than male passengers.

In [None]:
# Survival counts split by passenger class.
# `x` must be passed by keyword: seaborn >= 0.12 takes `data` as the first
# parameter and makes the rest keyword-only, so a positional "Pclass" raises
# TypeError there (seaborn 0.11 only warned).
labels = ["1st Class", "2nd Class", "3rd Class"]
x = list(range(len(labels)))
sns.countplot(x="Pclass", data=df, hue="Survived")
# NOTE(review): the tick labels assume the bars are drawn in ascending Pclass
# order (1, 2, 3) — confirm that Pclass is numeric in this dataset.
plt.xticks(x, labels)
plt.show()



0 - Not Survived, 1 - Survived

From the plot above, we can see that most passengers in 1st class survived, while most passengers in 3rd class lost their lives.

In [None]:
# Survival counts split by port of embarkation.
#
# Two fixes over the previous version of this cell:
#   * `x` is passed by keyword; seaborn >= 0.12 makes every argument after
#     `data` keyword-only, so the positional form raises TypeError there.
#   * the bar order is passed explicitly via `order=`. For string categories
#     seaborn otherwise uses order of first appearance in the data, so the
#     hard-coded tick labels could end up on the wrong bars.
#
# NOTE(review): assumes the raw codes are 'S', 'C', 'Q' and that missing values
# were stringified to 'nan' earlier — any other code is shown as-is rather than
# being mislabelled.
port_names = {
    "S": "Southampton",
    "C": "Cherbourg",
    "Q": "Queenstown",
    "nan": "Unknown",
}
seen_codes = df["Embarked"].dropna().unique().tolist()
# Known ports first (in the order listed above), then any unexpected codes.
port_order = [code for code in port_names if code in seen_codes]
port_order += [code for code in seen_codes if code not in port_names]

labels = [port_names.get(code, code) for code in port_order]
x = list(range(len(labels)))
sns.countplot(x="Embarked", data=df, hue="Survived", order=port_order)
plt.xticks(x, labels)
plt.show()

### Encoding 

In [None]:
# Encoder that the following cell fits on each text column to replace it with integer codes.
le = LabelEncoder()

In [None]:
# Replace every text column with integer codes from the label encoder.
#
# The check covers both object dtype and pandas' dedicated string dtype:
# comparing `dtype == "O"` alone skips columns stored with the string dtype,
# which would leave raw text in the frame and break the later model fit.
# The encoder is re-fitted per column, so codes are only meaningful within
# their own column.
for val in df.columns:
    column = df[val]
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    if is_text:
        df[val] = le.fit_transform(column)
df.head()

In [None]:
def fillmissing_val(col):
    """Return a copy of ``col`` with missing values replaced by the column mean.

    The result is returned instead of filled in place: an in-place ``fillna``
    on a Series obtained via ``df[...]`` is chained assignment, which does not
    propagate back to the DataFrame when pandas Copy-on-Write is active.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to impute.

    Returns
    -------
    pandas.Series
        ``col`` with every missing entry set to ``col.mean()``.
    """
    return col.fillna(col.mean())


# Assign the imputed columns back explicitly so the frame is always updated.
df["Age"] = fillmissing_val(df["Age"])
df["Fare"] = fillmissing_val(df["Fare"])
# NOTE(review): Embarked was label-encoded in the previous cell, so it is not
# expected to contain NaN here and a mean of category codes would not be
# meaningful anyway — call kept for parity with the original; confirm it is needed.
df["Embarked"] = fillmissing_val(df["Embarked"])

In [None]:
# Round ages to the nearest whole year and store them as integers.
df["Age"] = df["Age"].round().astype(int)

In [None]:
# Remaining missing values per column after the imputation above.
df.isnull().sum()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
correlation = df.corr()
_, corr_ax = plt.subplots()
sns.heatmap(correlation, annot=True, ax=corr_ax)
plt.show()

Age has the highest correlation with survived variable

In [None]:
# Survival counts per (rounded) age, on a wider figure than the default.
# `x` must be passed by keyword: seaborn >= 0.12 takes `data` as the first
# parameter and makes the rest keyword-only, so a positional "Age" raises
# TypeError there (seaborn 0.11 only warned).
plt.figure(figsize=(12, 6))
sns.countplot(x="Age", data=df, hue="Survived")
# Keep only every fifth tick so the axis stays readable.
plt.xticks(range(0, 90, 5))
plt.show()

0 - Not Survived, 1 - Survived
From the chart above, we can see that passengers aged 35 to 40 survived the most.

### Train/Test Split

In [None]:
# Separate the target column from the feature matrix.
target_column = "Survived"
Y = df[target_column]
X = df.drop(columns=target_column)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

### Linear Regression

In [None]:
# Linear regression model; its continuous output is rounded to 0/1 labels in the cells below.
model = LinearRegression()

##### Train Module

In [None]:
# Fit on the training split and turn the continuous regression output into
# class labels.
model.fit(xtrain, ytrain)
xtrain_predict = model.predict(xtrain)
# Linear regression output is unbounded, so plain rounding can produce labels
# such as -1 or 2. Clip to the valid {0, 1} range after rounding; predictions
# already inside that range are unaffected.
xtrain_predict = np.clip(np.round(xtrain_predict), 0, 1).astype(int)

# Side-by-side view of predictions and ground truth (indexed like ytrain).
train_predict = pd.DataFrame({
    "Predicted Value": xtrain_predict,
    "Actual Value": ytrain,
})

In [None]:
# Preview predicted vs. actual labels on the training split.
train_predict.head()

In [None]:
# Fraction of training rows whose rounded prediction matches the true label.
train_accuracy = accuracy_score(y_true=ytrain, y_pred=xtrain_predict)
print(train_accuracy)

##### Test module

In [None]:
# Predict on the held-out split and turn the continuous regression output
# into class labels.
xtest_predict = model.predict(xtest)
# Linear regression output is unbounded, so plain rounding can produce labels
# such as -1 or 2. Clip to the valid {0, 1} range after rounding; predictions
# already inside that range are unaffected.
xtest_predict = np.clip(np.round(xtest_predict), 0, 1).astype(int)

# Side-by-side view of predictions and ground truth (indexed like ytest).
test_predict = pd.DataFrame({
    "Predicted Value": xtest_predict,
    "Actual Value": ytest,
})

In [None]:
# Preview predicted vs. actual labels on the held-out split.
test_predict.head()

In [None]:
# Fraction of held-out rows whose rounded prediction matches the true label.
test_accuracy = accuracy_score(y_true=ytest, y_pred=xtest_predict)
print(test_accuracy)