### Importing necessary libraries
---

In [None]:
import numpy as np 
import pandas as pd

import os
# Enumerate every data file mounted under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import log_loss, mean_squared_error,accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Locations of the two Kaggle datasets: Titanic (classification) and salary (regression).
clas_path = '/kaggle/input/titanik1/titanic_train.csv'
reg_path = '/kaggle/input/salary-data-simple-linear-regression/Salary_Data.csv'

# Tokens that pandas should parse as missing values in both files.
naVals = [None, "None", "?", np.nan, ""]

clas_df = pd.read_csv(filepath_or_buffer=clas_path, na_values=naVals)
reg_df = pd.read_csv(filepath_or_buffer=reg_path, index_col=None, na_values=naVals)

# General method to clean missing values on any dataset
Creates mean and most-frequent imputer pipelines and applies them through a `ColumnTransformer`.

In [None]:
def clean_missing(df):
    """Impute missing values in ``df`` in place and return the same frame.

    ``int64``/``float64`` columns that contain missing values are filled with
    the column mean; ``object`` columns that contain missing values are filled
    with the most frequent value. Columns without missing values, and columns
    of any other dtype, are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the imputed values written back.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.compose import ColumnTransformer

    missing_counts = df.isna().sum()

    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    cols_numer_missing = [col for col in numeric_cols if missing_counts[col] > 0]

    categ_cols = df.select_dtypes(include=['object']).columns
    cols_categ_missing = [col for col in categ_cols if missing_counts[col] > 0]

    cols_to_impute = cols_numer_missing + cols_categ_missing
    if not cols_to_impute:
        # Nothing to impute: skip fitting a transformer with no columns.
        return df

    numer_val_mean_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy='mean'))])
    categ_val_mode_imputer = Pipeline(steps=[("imputer", SimpleImputer(strategy='most_frequent'))])

    preproc = ColumnTransformer(transformers=[
        ('mean_imputer', numer_val_mean_imputer, cols_numer_missing),
        ('mode_imputer', categ_val_mode_imputer, cols_categ_missing),
    ])

    # Output columns follow the transformer order: numeric first, then categorical.
    imputed_values = preproc.fit_transform(df)

    # Building the frame also validates that one output column exists per
    # requested column (the constructor raises on a shape mismatch).
    imputed = pd.DataFrame(imputed_values, columns=cols_to_impute)

    # Write back positionally via to_numpy(): rows of `imputed` are in the same
    # order as rows of `df`, whereas label-based alignment (DataFrame.update)
    # would mis-assign values whenever df's index is not 0..n-1.
    for col in cols_numer_missing:
        # The stacked output may be an object array; restore the numeric dtype.
        df[col] = imputed[col].to_numpy(dtype=df[col].dtype)
    for col in cols_categ_missing:
        df[col] = imputed[col].to_numpy()

    return df

### Regression Task
---
### Apply the averaging method to the salary dataset

In [None]:
# Preview the first rows of the salary data.
reg_df.head()

In [None]:
# Column dtypes and non-null counts of the salary data.
reg_df.info()

In [None]:
# Number of missing values in each column of the salary data.
reg_df.isna().sum()

### Converting floats to ints for simplicity's sake
---

In [None]:
# Cast both columns to integers; astype(int) drops the fractional part.
for column in ("YearsExperience", "Salary"):
    reg_df[column] = reg_df[column].astype(int)

# Show the resulting dtypes.
reg_df.info()

In [None]:
# Separate the regression target ("Salary") from the remaining feature columns.
target = reg_df["Salary"].copy()
X_reg = reg_df.drop(columns=["Salary"])


In [None]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression

### Combining regression models and averaging
---

In [None]:
# Hold out 30% of the salary data for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_reg, target, train_size=0.70, random_state=42)

# Keep regression-specific aliases: X_train/X_test/y_train/y_test are rebound
# by the classification split later in the notebook.
Xreg_train, Xreg_test, yreg_train, yreg_test = X_train, X_test, y_train, y_test

# Base regressors with default hyper-parameters; the stochastic ones are
# seeded so that Restart & Run All reproduces the same numbers.
model_1 = LinearRegression()
model_2 = xgb.XGBRegressor(random_state=42)
model_3 = RandomForestRegressor(random_state=42)

# Train every model on the training split.
model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)

# Predict on the held-out split.
pred_1 = model_1.predict(X_test)
pred_2 = model_2.predict(X_test)
pred_3 = model_3.predict(X_test)

# Averaging ensemble: the final prediction is the mean of the three models.
pred_final = (pred_1 + pred_2 + pred_3) / 3.0
# Integer version of the predictions, used only for the comparison table.
preds = [int(value) for value in pred_final]

# Root mean squared error between the true salaries and the averaged
# (un-truncated) predictions.
rmse = np.sqrt(mean_squared_error(y_test, pred_final))
print('RMSE: {:.2f}'.format(rmse))

# Side-by-side view of true and predicted salaries.
res = {"Y_test": y_test, "PredY": preds}
pd.DataFrame(res)


## Classification Task
---
### Voting method - uses several models and selects the outcome by majority vote

In [None]:
# Preview the first rows of the Titanic data.
clas_df.head()

In [None]:
# Number of missing values in each column before imputation.
clas_df.isna().sum()

In [None]:
# clean_missing writes the imputed values into the frame it receives and
# returns that same frame, so clas_df_null_free and clas_df are one object.
clas_df_null_free = clean_missing(clas_df)
# Re-count missing values per column to check the imputation.
clas_df_null_free.isna().sum()

In [None]:
# Column dtypes and non-null counts after imputation.
clas_df_null_free.info()

In [None]:
# Start from the imputed frame explicitly rather than relying on clas_df
# having been modified in place by clean_missing.
X_train = clas_df_null_free.copy()
# Separate the classification target from the features.
target = X_train.pop("Survived")


#### Removing unimportant columns

In [None]:
# Columns that are dropped and therefore not used as model features.
drop_col = ["Name","Cabin","Ticket","PassengerId"]
# Rebind instead of dropping in place and ignore labels that are already gone,
# so re-running this cell does not raise a KeyError.
X_train = X_train.drop(columns=drop_col, errors="ignore")
# Show a preview rather than the whole frame.
X_train.head()

In [None]:
# Cast Fare and Age to integers; astype(int) drops the fractional part.
X_train["Fare"] = X_train["Fare"].astype(int)
X_train["Age"] = X_train["Age"].astype(int)

### Encoding categoricals

In [None]:
# One-hot encode the remaining categorical columns with pandas' default settings.
X_train = pd.get_dummies(data=X_train)
X_train

In [None]:
# Hold out 20% of the passengers for evaluation. The seed matches the
# regression split so that results are reproducible across runs.
# NOTE: this cell rebinds X_train to the training subset, so it must be
# re-run together with the feature-preparation cells above it.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, target, test_size=0.20, random_state=42)

# max_iter is raised because the features are unscaled and the default
# iteration budget may not be enough for the solver to converge (the
# resulting warning would be hidden by the global warnings filter).
model_1 = LogisticRegression(max_iter=1000)
model_2 = SVC()
model_3 = RandomForestClassifier(n_estimators=5, random_state=42)
model_4 = GaussianNB()

# Hard voting: each estimator casts one vote and the majority class wins.
# Estimator labels name the model they hold (model_2 is an SVC).
final_model = VotingClassifier(
    estimators=[('lr', model_1), ('svc', model_2), ('rf', model_3), ('NB', model_4)],
    voting='hard')

In [None]:
# 8-fold cross-validation of the ensemble on the training split; the default
# scorer for a classifier is accuracy.
results = cross_val_score(final_model, X_train, y_train, cv=8)
print('CV accuracy: {:.2f} (+/- {:.2f})'.format(results.mean(), results.std()))

# Fit on the full training split and predict the held-out passengers.
final_model.fit(X_train, y_train)
pred_final = final_model.predict(X_test)

# Log loss is deliberately not reported: it is defined on predicted
# probabilities, and a hard-voting ensemble only produces class labels.
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, pred_final)))


In [None]:
# True labels next to the ensemble's predicted labels for the held-out split.
pd.DataFrame(data={"Y_test": y_test, "PredictedY": pred_final})

---
## | Lab 22 May |
---

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting on the salary split kept under the Xreg_*/yreg_* names.
model = GradientBoostingRegressor(random_state=42)
model.fit(Xreg_train, yreg_train)
preds = model.predict(Xreg_test)

# For a regressor, score() is the coefficient of determination (R^2),
# not a classification accuracy, so it is labelled accordingly.
print("R^2 score: %.2f" % model.score(Xreg_test, yreg_test))

In [None]:
# True salaries next to the gradient-boosting predictions for the held-out split.
pd.DataFrame(data={"Y_test": yreg_test, "PredictedY": preds})