In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import neighbors
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load train.csv into df1; the path is relative to the notebook's working directory.
df1= pd.read_csv('../input/titanic/train.csv')
df1

In [None]:
# Load test.csv into `valid`; it is cleaned and scored in the last part of the notebook.
valid = pd.read_csv('../input/titanic/test.csv')
valid

In [None]:
# First rows of df1.
df1.head()

In [None]:
# Last rows of df1.
df1.tail()

In [None]:
# Summary statistics for df1.
df1.describe()

In [None]:
# Column dtypes and non-null counts.
df1.info()

In [None]:
# Missing values per column (absolute count).
df1.isna().sum()

In [None]:
# Missing values per column as a fraction of all rows.
df1.isna().mean()

In [None]:
def missing(df1):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with ``Missing_Number`` (count of nulls) and
        ``Missing_Percent`` (nulls as a percentage of rows). Both series are
        sorted in descending order before being joined side by side.
    """
    null_mask = df1.isnull()
    null_counts = null_mask.sum()
    row_counts = null_mask.count()

    by_number = null_counts.sort_values(ascending=False)
    by_percent = ((null_counts / row_counts) * 100).sort_values(ascending=False)

    summary = pd.concat(
        [by_number, by_percent],
        axis=1,
        keys=['Missing_Number', 'Missing_Percent'],
    )
    return summary

In [None]:
# Missing-value summary (count and percentage per column) for df1.
missing(df1)

In [None]:
# Drop, in place, every column in which more than 40% of the values are null.
sparse_columns = [name for name in df1.columns if df1[name].isnull().mean() * 100 > 40]
df1.drop(sparse_columns, axis=1, inplace=True)

In [None]:
# df1 after the sparse columns were removed.
df1

In [None]:
# Remaining column names.
df1.columns

In [None]:
# Number of columns per dtype.
# The values are passed as `x=` explicitly: countplot's first positional
# parameter is not the same across seaborn releases, so a positional Series
# is either rejected or interpreted as `data`.
sns.countplot(x=df1.dtypes.map(str))
plt.show()

In [None]:
# Same information as a table: how many columns have each dtype.
df1.dtypes.value_counts()

In [None]:
def f(x):
    """Return the fill value for one column (or one group's slice of it).

    Numeric data -> median; anything else -> most frequent value.
    Returns NaN when there is nothing to aggregate (every value missing)
    instead of raising IndexError on an empty ``mode()``.
    """
    # pandas' dtype helpers also accept extension dtypes, which np.issubdtype
    # rejects; bool is excluded so it keeps using the mode, as before.
    if pd.api.types.is_numeric_dtype(x) and not pd.api.types.is_bool_dtype(x):
        return x.median()
    modes = x.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Step 1: fill each gap from the rows that share the same SibSp value.
df1 = df1.fillna(df1.groupby('SibSp').transform(f))

# Step 2: a SibSp group whose values are all missing has a NaN median/mode and
# is left untouched by step 1, so fall back to the whole-column statistic.
fallback_values = {}
for column_name in df1.columns[df1.isna().any()]:
    fill_value = f(df1[column_name])
    if pd.notna(fill_value):
        fallback_values[column_name] = fill_value
if fallback_values:
    df1 = df1.fillna(fallback_values)
df1

In [None]:
# Print descriptive statistics for every non-object column, together with the
# quartiles and the 1.5*IQR lower/upper limits derived from them.
for col in df1.columns:
    if df1[col].dtypes == object:
        continue  # object columns are not summarised

    values = df1[col]
    q1, q2, q3 = (values.quantile(level) for level in (0.25, 0.50, 0.75))
    IQR = q3 - q1
    llp = q1 - 1.5 * IQR  # lower limit
    ulp = q3 + 1.5 * IQR  # upper limit

    print('column name', col)
    print('q1', q1)
    print('q2', q2)
    print('q3', q3)
    print('IQR', IQR)
    print('llp', llp)
    print('ulp', ulp)
    print('mean:', values.mean())
    print('median:', values.median())
    print('mode', values.mode()[0])
    print('skewness:', values.skew())
    print('kurtosis:', values.kurtosis())
    print('std', values.std())
    print('max', values.max())
    print('min', values.min())
    print('null_value count:', values.isnull().sum())
    print('\n')

In [None]:
# Column dtypes after imputation.
df1.dtypes

In [None]:
# The following cells list the distinct values of each column, one column per cell.
df1['PassengerId'].unique()

In [None]:
# Distinct values of Survived.
df1['Survived'].unique()

In [None]:
# Distinct values of Pclass.
df1['Pclass'].unique()

In [None]:
# Distinct values of Name.
df1['Name'].unique()

In [None]:
# Distinct values of Sex.
df1['Sex'].unique()

In [None]:
# Distinct values of Age.
df1['Age'].unique()

In [None]:
# Distinct values of SibSp.
df1['SibSp'].unique()

In [None]:
# Distinct values of Parch.
df1['Parch'].unique()

In [None]:
# Distinct values of Ticket.
df1['Ticket'].unique()

In [None]:
# Distinct values of Fare.
df1['Fare'].unique()

In [None]:
# Distinct values of Embarked.
df1['Embarked'].unique()

In [None]:
# Count, per numeric column, the values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Only numeric columns are used: quantiles and the </> comparisons are not
# defined for the object columns that may still be present at this point
# (recent pandas raises instead of silently skipping them).
numeric_df = df1.select_dtypes(include='number')
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
print('outliers count of each columns')
((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).sum()

In [None]:
# Correlation heatmap of the numeric columns.
# The numeric subset is selected explicitly: recent pandas raises when corr()
# meets non-numeric columns instead of silently ignoring them.
plt.figure(figsize=(20,20))
sns.heatmap(df1.select_dtypes(include='number').corr(), cmap="plasma_r", annot=True)

In [None]:
# Number of rows per value of Sex.
# The column is named via `x=`; a positional column name is not accepted as
# the x variable by every seaborn release.
plt.figure(figsize=(10,5))
sns.countplot(x='Sex', data=df1)

In [None]:
# Mean Fare per value of Sex (seaborn's barplot aggregates with the mean by default).
# No plt.legend() call: the plot has no labelled artists, so there is nothing
# for a legend to show.
plt.figure(figsize=(10,6))
ax = sns.barplot(x='Sex', y='Fare', data=df1)
ax.set(title='Mean Fare by Sex', xlabel='Sex', ylabel='Fare')
plt.show()

In [None]:
# Distribution of Age.
# Age is handled as a numeric quantity in this notebook, so it is binned into
# a histogram; a countplot draws one bar per distinct value instead.
fig, ax = plt.subplots(figsize=(10, 5))
df1['Age'].plot.hist(bins=30, ax=ax)
ax.set(title='Age distribution', xlabel='Age', ylabel='Count')
plt.show()

In [None]:
# Distribution of Fare.
# Fare is handled as a numeric quantity in this notebook, so it is binned into
# a histogram; a countplot draws one bar per distinct value instead.
fig, ax = plt.subplots(figsize=(10, 5))
df1['Fare'].plot.hist(bins=30, ax=ax)
ax.set(title='Fare distribution', xlabel='Fare', ylabel='Count')
plt.show()

In [None]:
# Number of rows per Pclass value, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Pclass', data=df1, palette='GnBu', ax=ax)

In [None]:
# Row counts per SibSp value, then per Survived value (two separate figures).
sns.countplot(x='SibSp', data = df1)
plt.show()
sns.countplot(x='Survived', data = df1)
plt.show()

In [None]:
# Fare against Name.
# NOTE(review): Name is still a string column here (encoding happens later), so
# the y axis gets one tick per distinct name — confirm this plot is intended.
sns.scatterplot(x = 'Fare', y = 'Name', data = df1)
plt.show()

In [None]:
# Sex against Pclass.
# NOTE(review): both axes take only a few distinct values, so points overlap;
# a count-based plot may have been intended — confirm.
sns.scatterplot(x = 'Sex', y = 'Pclass', data = df1)
plt.show()

In [None]:
# Smaller correlation heatmap of the numeric columns.
# The numeric subset is selected explicitly: recent pandas raises when corr()
# meets non-numeric columns instead of silently ignoring them.
plt.figure(figsize=(9,6))
sns.heatmap(df1.select_dtypes(include='number').corr(),annot=True)

In [None]:
# Pairwise relationships between the columns of df1.
sns.pairplot(data=df1)

In [None]:
# One distribution plot per non-object column, drawn on a 7-column grid.
plot_columns = [i for i in df1.columns if df1[i].dtypes != 'object']

# Keep the original 6x7 layout, but add rows when there are more than 42
# columns so that plt.subplot never receives an out-of-range index.
grid_cols = 7
grid_rows = max(6, -(-len(plot_columns) // grid_cols))  # ceiling division

# plt.figure rather than plt.subplots: the latter would also create one
# full-figure Axes that is never drawn on.
plt.figure(figsize=(30,25))
for count, i in enumerate(plot_columns, start=1):
    plt.subplot(grid_rows, grid_cols, count)
    sns.distplot(df1[i])

plt.show()

In [None]:
# One box plot per non-object column, drawn on a 7-column grid.
plot_columns = [i for i in df1.columns if df1[i].dtypes != 'object']

# Keep the original 6x7 layout, but add rows when there are more than 42
# columns so that plt.subplot never receives an out-of-range index.
grid_cols = 7
grid_rows = max(6, -(-len(plot_columns) // grid_cols))  # ceiling division

# plt.figure rather than plt.subplots: the latter would also create one
# full-figure Axes that is never drawn on.
plt.figure(figsize=(30,25))
for count, i in enumerate(plot_columns, start=1):
    plt.subplot(grid_rows, grid_cols, count)
    # Pass the values as `x=`; how a positional Series is interpreted is not
    # the same across seaborn releases.
    sns.boxplot(x=df1[i])

plt.show()


In [None]:
# Column dtypes just before the object columns are label-encoded.
df1.dtypes

In [None]:
# Replace every object column of df1 with integer codes.
# A separate fitted encoder is kept per column in `label_encoders`; refitting
# one shared encoder would discard each column's mapping, making it impossible
# to invert the codes or to apply the same mapping to other data.
le = LabelEncoder()  # kept so the name `le` is defined even with no object columns
label_encoders = {}
for col in df1.columns:
    if df1[col].dtypes == object:
        le = LabelEncoder()
        df1[col] = le.fit_transform(df1[col])
        label_encoders[col] = le

In [None]:
# Features: every column of df1 except 'Fare'. Target: 'Fare'.
# NOTE(review): X still includes 'Survived', and the last cells of the notebook
# store this model's predictions under a column named "Survived" — confirm
# which target is actually intended.
X=df1.drop('Fare',axis=1)
y=df1['Fare']

In [None]:
# Feature frame.
X

In [None]:
# Target series.
y

In [None]:
# 75% / 25% train / hold-out split with a fixed seed.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [None]:
def train_models(X_train, y_train, X_eval=None, y_eval=None, max_features=75):
    """Fit several regressors and print their hold-out metrics.

    Models, in order: decision tree, random forest, SVR (rbf), SVR (linear)
    and k-nearest-neighbours. For each one the R2 score, mean absolute error,
    mean squared error and root mean squared error on the hold-out data are
    printed.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_eval, y_eval : optional
        Hold-out features and target used for scoring. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used, which is what the
        function always did before these parameters existed.
    max_features : int, default 75
        Requested number of features considered per split by the tree-based
        models. It is clamped to the number of columns in ``X_train``, because
        a value larger than that is not a meaningful setting.

    Returns
    -------
    None
        Results are printed only.
    """
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    n_features = np.shape(X_train)[1]
    split_features = max(1, min(max_features, n_features))

    models = [
        ('Decision Tree',
         DecisionTreeRegressor(max_features=split_features, max_depth=4, random_state=0)),
        ('RandomForestRegressor',
         RandomForestRegressor(n_estimators=100, max_features=split_features, random_state=0)),
        ('SupportvectorRegression (rbf)', SVR(kernel='rbf')),
        ('SupportvectorRegression (linear)', SVR(kernel='linear')),
        ('knn', neighbors.KNeighborsRegressor()),
    ]

    for position, (label, model) in enumerate(models, start=1):
        model.fit(X_train, y_train)
        predictions = model.predict(X_eval)

        mean_sq_err = metrics.mean_squared_error(y_eval, predictions)

        # These are hold-out scores, so they are labelled as such rather than
        # as "training accuracy".
        print('[%d]%s hold-out R2 score: ' % (position, label), r2_score(y_eval, predictions))
        print('Mean Absolute Error:', metrics.mean_absolute_error(y_eval, predictions))
        print('Mean Square Error:', mean_sq_err)
        print('Root Mean Square Error:', np.sqrt(mean_sq_err))
        print('\t')

In [None]:
# Linear regression fitted on the training split.
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()  
mlr.fit(X_train, y_train)

In [None]:
# Predictions for the hold-out split.
y_pred_mlr= mlr.predict(X_test)
y_pred_mlr

In [None]:
# R2 on the hold-out split, printed multiplied by 100.
r2_mlr =r2_score(y_test,y_pred_mlr)
print('r2_score:',r2_mlr*100)

In [None]:
# The frame loaded from test.csv, before any cleaning.
valid

In [None]:
# Missing-value summary (count and percentage per column) for `valid`.
missing(valid)

In [None]:
# Drop, in place, every column of `valid` in which more than 40% of the values are null.
sparse_columns = [name for name in valid.columns if valid[name].isnull().mean() * 100 > 40]
valid.drop(sparse_columns, axis=1, inplace=True)

In [None]:
# `valid` after the sparse columns were removed.
valid

In [None]:
def f(x):
    """Return the fill value for one column (or one group's slice of it).

    Numeric data -> median; anything else -> most frequent value.
    Returns NaN when there is nothing to aggregate (every value missing)
    instead of raising IndexError on an empty ``mode()``.
    """
    # pandas' dtype helpers also accept extension dtypes, which np.issubdtype
    # rejects; bool is excluded so it keeps using the mode, as before.
    if pd.api.types.is_numeric_dtype(x) and not pd.api.types.is_bool_dtype(x):
        return x.median()
    modes = x.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Step 1: fill each gap from the rows that share the same SibSp value, the same
# grouping key that is used to impute df1. (Grouping by 'Ticket' leaves a value
# unfilled whenever no other row with that ticket has it.)
# assumes `valid` has a 'SibSp' column like df1 — TODO confirm
valid = valid.fillna(valid.groupby('SibSp').transform(f))

# Step 2: a group whose values are all missing has a NaN median/mode and is
# left untouched by step 1, so fall back to the whole-column statistic.
fallback_values = {}
for column_name in valid.columns[valid.isna().any()]:
    fill_value = f(valid[column_name])
    if pd.notna(fill_value):
        fallback_values[column_name] = fill_value
if fallback_values:
    valid = valid.fillna(fallback_values)
valid

In [None]:
# Column names of `valid` after imputation.
valid.columns

In [None]:
# Replace every object column of `valid` with integer codes.
# NOTE(review): the encoder is fitted again on `valid` itself, independently of
# the fit on df1, so a given string is not guaranteed to receive the same code
# in both frames — confirm this is acceptable for the model's inputs.
le=LabelEncoder()
for col in valid.columns:
    if valid[col].dtypes == 'object':
        valid[col]= le.fit_transform(valid[col])

In [None]:
# Frequency of each Embarked value in `valid` (integer codes if the column was encoded above).
valid['Embarked'].value_counts()

In [None]:
# `valid` as it will be passed to the model.
valid

In [None]:
# Predict with the linear model.
# NOTE(review): mlr was fitted on df1 without 'Fare' and with 'Fare' as the
# target, while `valid` is passed as-is; its columns must match the training
# features in name and order — confirm.
y_valid = mlr.predict(valid)

In [None]:
# Raw predictions.
y_valid

In [None]:
# Assemble the output frame: one row per PassengerId of `valid`.
# NOTE(review): y_valid comes from a regressor whose target was 'Fare', yet it
# is stored under the column name "Survived" — confirm the intended target.
output = pd.DataFrame({"PassengerId": valid['PassengerId'],"Survived": y_valid})
output

In [None]:
# Save the output
# Written to the current working directory, without the index column.
output.to_csv("titanic_ml.csv", index=False)
output.head(10)