# Predicting Job Satisfaction of Kaggle Members

**Presentation and more info: https://drive.google.com/file/d/1mVdBrK_yxfsh8duJYXe5-57jyAloOnd7/view?usp=sharing** 

# Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.utils import shuffle
from matplotlib import pyplot
import seaborn as sns
!pip install pycountry-convert

import pycountry_convert as pc
from sklearn import linear_model
from sklearn.model_selection import KFold,cross_val_score
from statistics import mean
from sklearn.linear_model import RidgeCV
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV


%matplotlib inline

# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Collecting pycountry-convert
  Downloading pycountry_convert-0.7.2-py3-none-any.whl (13 kB)
Collecting pprintpp>=0.3.0
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl (16 kB)
Collecting pytest-cov>=2.5.1
  Downloading pytest_cov-2.12.1-py2.py3-none-any.whl (20 kB)
Collecting coverage>=5.2.1
  Downloading coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl (242 kB)
[K     |████████████████████████████████| 242 kB 6.6 MB/s 
[?25hCollecting pytest-mock>=1.6.3
  Downloading pytest_mock-3.6.1-py3-none-any.whl (12 kB)
Collecting repoze.lru>=0.7
  Downloading repoze.lru-0.7-py3-none-any.whl (10 kB)
Installing collected packages: coverage, repoze.lru, pytest-mock, pytest-cov, pprintpp, pycountry-convert
Successfully installed coverage-5.5 pprintpp-0.4.0 pycountry-convert-0.7.2 pytest-cov-2.12.1 pytest-mock-3.6.1 repoze.lru-0.7
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m
/kaggle/input/cs412-fall2020/test.xlsx
/kaggl

The cell above imports the libraries used in this notebook.

# Reading Data and Merging

In [2]:
# Load train.xlsx and test.xlsx from the Kaggle input directory into DataFrames.
train_df = pd.read_excel('/kaggle/input/cs412-fall2020/train.xlsx')
test_df = pd.read_excel('/kaggle/input/cs412-fall2020/test.xlsx')


We drop the rows whose age is 0 or 1 and pop the label column from the training data.

In [3]:
# Rows whose Age is exactly 0 or 1 are discarded from both frames
# (rows with a missing Age compare unequal to both and are therefore kept).
train_df = train_df[(train_df.Age != 0) & (train_df.Age != 1)]
test_df = test_df[(test_df.Age != 0) & (test_df.Age != 1)]

# Size of the filtered training frame, measured while the label column is still present.
count_row, count_col = train_df.shape[0], train_df.shape[1]

# Detach the label column; train_df keeps only the features from here on.
y = train_df.pop("JobSatisfaction")



Show the percentages of NaN values in each column

In [4]:
train_df.isnull().mean() * 100

ID                                         0.000000
GenderSelect                               0.180930
Country                                    0.289488
Age                                        1.230324
EmploymentStatus                           0.000000
CodeWriter                                 0.000000
CurrentJobTitleSelect                      0.036186
TitleFit                                   1.845486
CurrentEmployerType                        1.266510
MLToolNextYearSelect                       4.161390
MLMethodNextYearSelect                     5.011760
LanguageRecommendationSelect               3.528135
LearningPlatformUsefulnessBlogs           54.224715
LearningPlatformUsefulnessKaggle          42.699475
LearningPlatformUsefulnessCourses         46.788493
LearningPlatformUsefulnessProjects        54.785598
LearningPlatformUsefulnessSO              45.811471
LearningPlatformUsefulnessTextbook        60.177311
LearningPlatformUsefulnessYouTube         56.341596
DataScienceI

## Dropping Columns with a High Percentage of Null Values

We drop the columns whose percentage of null values is higher than 35% (and which are not important).

In [5]:

# The same set of WorkMethodsFrequency* columns is removed from both frames,
# after which each frame gets a fresh 0..n-1 index.
work_method_columns = [
    "WorkMethodsFrequencyCross-Validation",
    "WorkMethodsFrequencyDataVisualization",
    "WorkMethodsFrequencyDecisionTrees",
    "WorkMethodsFrequencyLogisticRegression",
    "WorkMethodsFrequencyNeuralNetworks",
    "WorkMethodsFrequencyPCA",
    "WorkMethodsFrequencyRandomForests",
    "WorkMethodsFrequencyTimeSeriesAnalysis",
]

train_df = train_df.drop(columns=work_method_columns).reset_index(drop=True)
test_df = test_df.drop(columns=work_method_columns).reset_index(drop=True)

# Handling with Features

Label column is "JobSatisfaction".

Numerical features are "ID,Age,CompensationScore"

Other features are nominal or ordinal features.

## 2- Related Columns

This function handles the NaN values of a column using a second column that is related to it.

In [6]:
def comparefornull(change, effects, df_to_change, reference_df=None):
    """Fill missing ``change`` values from the related ``effects`` column.

    For every category of ``effects`` found in the reference frame, the most
    frequent ``change`` value is looked up. Rows of ``df_to_change`` where
    ``change`` is missing and ``effects`` is present then receive the value
    learned for their ``effects`` category. ``df_to_change`` is modified in
    place; nothing is returned.

    Parameters
    ----------
    change : str
        Name of the column whose missing values are filled.
    effects : str
        Name of the related column used to choose the fill value.
    df_to_change : pandas.DataFrame
        Frame that is filled in place.
    reference_df : pandas.DataFrame, optional
        Frame the per-category modes are computed from. Defaults to the
        notebook-level ``train_df``.

    Notes
    -----
    Rows whose ``effects`` category has no observed ``change`` value in the
    reference frame, or does not occur there at all, are left untouched
    instead of raising.
    """
    if reference_df is None:
        reference_df = train_df

    # Only rows with a known `effects` value can contribute to a category.
    known = reference_df[[change, effects]].dropna(subset=[effects])

    # Most frequent `change` value per `effects` category. Series.mode() is
    # sorted, so ties resolve to the smallest value.
    ml_dict = {}
    for category, values in known.groupby(effects, sort=False)[change]:
        modes = values.mode()
        if not modes.empty:  # category had only missing `change` values
            ml_dict[category] = modes.iloc[0]

    # Fill only where `change` is missing and a fill value is known.
    needs_fill = df_to_change[change].isna() & df_to_change[effects].isin(list(ml_dict))
    if needs_fill.any():
        fill_values = df_to_change.loc[needs_fill, effects].map(ml_dict)
        # to_numpy() makes the assignment positional, independent of index labels.
        df_to_change.loc[needs_fill, change] = fill_values.to_numpy()

In [7]:
# Fill the training frame first, then the test frame; within each frame
# TitleFit is filled before CurrentJobTitleSelect.
for frame in (train_df, test_df):
    comparefornull("TitleFit", "CurrentJobTitleSelect", frame)
    comparefornull("CurrentJobTitleSelect", "DataScienceIdentitySelect", frame)


## Merge Test and Train Data

We merge train data and test data to do the operations on the data once.

In [8]:
# Training rows come first, test rows after them.
frames = [train_df, test_df]

# Stack both frames and renumber the rows 0..n-1 in one step.
temp_df = pd.concat(frames, ignore_index=True)


## Change Null Value with Mode of Column

This function replaces the NaN values in a given column with the mode of that column in the training data.

In [9]:
def moder (column):
    """Fill missing values of ``column`` in ``temp_df`` with that column's mode in ``train_df``.

    When several values tie for most frequent, the first entry returned by
    ``Series.mode()`` is used.
    """
    most_frequent = train_df[column].mode()[0]
    temp_df[column] = temp_df[column].fillna(most_frequent)

In [10]:
# Columns whose missing values are replaced by the training-set mode.
mode_filled_columns = [
    "GenderSelect",
    "Country",
    "EmploymentStatus",
    "CodeWriter",
    "CurrentJobTitleSelect",
    "TitleFit",
    "CurrentEmployerType",
    "MLToolNextYearSelect",
    "MLMethodNextYearSelect",
    "LanguageRecommendationSelect",
    "DataScienceIdentitySelect",
    "FormalEducation",
    "MajorSelect",
    "Tenure",
    "PastJobTitlesSelect",
    "MLSkillsSelect",
    "MLTechniquesSelect",
    "EmployerIndustry",
    "EmployerSize",
    "WorkProductionFrequency",
    "WorkAlgorithmsSelect",
    "WorkDataVisualizations",
    "WorkInternalVsExternalTools",
    "WorkMLTeamSeatSelect",
    "RemoteWork",
    "Age",
    "CompensationScore",
]
for column_name in mode_filled_columns:
    moder(column_name)

# CurrentEmployerType is stored as strings from here on.
temp_df["CurrentEmployerType"] = temp_df["CurrentEmployerType"].astype(str)


## Map Ordinal Features' Columns:

DataScienceIdentitySelect, CodeWriter, TitleFit, Tenure,

EmployerSize, WorkProductionFrequency, WorkToolsFrequencyPython, 

WorkDataVisualizations, RemoteWork, FormalEducation

In [11]:
# Ordinal encoding, listed in ascending code order.
DataScienceIdentitySelect_map = {
    'No': 0,
    'Sort of (Explain more)': 1,
    'Yes': 2,
}
identity_encoded = temp_df['DataScienceIdentitySelect'].replace(DataScienceIdentitySelect_map)
temp_df['DataScienceIdentitySelect'] = identity_encoded



In [12]:
# Binary encoding of the CodeWriter answer.
Codewriter_map = {
    'No': 0,
    'Yes': 1,
}
codewriter_encoded = temp_df['CodeWriter'].replace(Codewriter_map)
temp_df['CodeWriter'] = codewriter_encoded




In [13]:
# Ordinal encoding, listed in ascending code order.
TitleFit_map = {
    'Poorly': 0,
    'Fine': 1,
    'Perfectly': 2,
}
title_fit_encoded = temp_df['TitleFit'].replace(TitleFit_map)
temp_df['TitleFit'] = title_fit_encoded



In [14]:
# Ordinal encoding, listed in ascending code order.
Tenure_map = {
    "I don't write code to analyze data": 0,
    'Less than a year': 1,
    '1 to 2 years': 2,
    '3 to 5 years': 3,
    '6 to 10 years': 4,
    'More than 10 years': 5,
}
tenure_encoded = temp_df['Tenure'].replace(Tenure_map)
temp_df['Tenure'] = tenure_encoded


In [15]:
# Ordinal encoding, listed in ascending code order; the two non-answers take codes 0 and 1.
EmployerSize_map = {
    'I prefer not to answer': 0,
    "I don't know": 1,
    'Fewer than 10 employees': 2,
    '10 to 19 employees': 3,
    '20 to 99 employees': 4,
    "100 to 499 employees": 5,
    '500 to 999 employees': 6,
    '1,000 to 4,999 employees': 7,
    '5,000 to 9,999 employees': 8,
    '10,000 or more employees': 9,
}
employer_size_encoded = temp_df['EmployerSize'].replace(EmployerSize_map)
temp_df['EmployerSize'] = employer_size_encoded


In [16]:
# Ordinal encoding, listed in ascending code order.
wpf_map = {
    "Don't know": 0,
    'Never': 1,
    'Rarely': 2,
    'Sometimes': 3,
    'Most of the time': 4,
    'Always': 5,
}
production_encoded = temp_df['WorkProductionFrequency'].replace(wpf_map)
temp_df['WorkProductionFrequency'] = production_encoded


In [17]:
# Ordinal encoding, listed in ascending code order.
visual_map = {
    'None': 0,
    "Less than 10% of projects": 1,
    '10-25% of projects': 2,
    '26-50% of projects': 3,
    '51-75% of projects': 4,
    "76-99% of projects": 5,
    '100% of projects': 6,
}
visual_encoded = temp_df['WorkDataVisualizations'].replace(visual_map)
temp_df['WorkDataVisualizations'] = visual_encoded


In [18]:
# Ordinal encoding, listed in ascending code order.
RemoteWork_map = {
    "Don't know": 0,
    'Never': 1,
    'Rarely': 2,
    'Sometimes': 3,
    'Most of the time': 4,
    'Always': 5,
}
remote_encoded = temp_df['RemoteWork'].replace(RemoteWork_map)
temp_df['RemoteWork'] = remote_encoded


In [19]:
# Ordinal encoding, listed in ascending code order.
edu_map = {
    'I prefer not to answer': 0,
    'I did not complete any formal education past high school': 1,
    "Some college/university study without earning a bachelor's degree": 2,
    "Bachelor's degree": 3,
    "Professional degree": 4,
    "Master's degree": 5,
    'Doctoral degree': 6,
}
education_encoded = temp_df['FormalEducation'].replace(edu_map)
temp_df['FormalEducation'] = education_encoded


In [20]:
def learnplatf(col_name):
  """Encode the usefulness answers of ``col_name`` in ``temp_df`` as 0, 1 or 2.

  Values that are not one of the three answers (including NaN) are left as they are.
  """
  usefulness_codes = {
      'Not Useful': 0,
      'Somewhat useful': 1,
      'Very useful': 2,
  }
  encoded = temp_df[col_name].replace(usefulness_codes)
  temp_df[col_name] = encoded

In [21]:
def workchallenge(col_name):
  """Encode the frequency answers of ``col_name`` in ``temp_df`` as 1 to 4.

  Values that are not one of the four answers (including NaN) are left as they are.
  """
  frequency_codes = {
      'Rarely': 1,
      'Sometimes': 2,
      'Often': 3,
      'Most of the time': 4,
  }
  encoded = temp_df[col_name].replace(frequency_codes)
  temp_df[col_name] = encoded


In [22]:
# Columns answered on the Rarely .. Most of the time scale.
frequency_scale_columns = [
    "WorkChallengeFrequencyPolitics",
    "WorkChallengeFrequencyUnusedResults",
    "WorkChallengeFrequencyDirtyData",
    "WorkChallengeFrequencyExplaining",
    "WorkChallengeFrequencyTalent",
    "WorkChallengeFrequencyClarity",
    "WorkChallengeFrequencyDataAccess",
    "WorkToolsFrequencySQL",
    "WorkToolsFrequencyR",
    "WorkToolsFrequencyPython",
]
for column_name in frequency_scale_columns:
    workchallenge(column_name)

# Columns answered on the Not Useful .. Very useful scale.
usefulness_scale_columns = [
    "LearningPlatformUsefulnessBlogs",
    "LearningPlatformUsefulnessKaggle",
    "LearningPlatformUsefulnessCourses",
    "LearningPlatformUsefulnessProjects",
    "LearningPlatformUsefulnessSO",
    "LearningPlatformUsefulnessTextbook",
    "LearningPlatformUsefulnessYouTube",
]
for column_name in usefulness_scale_columns:
    learnplatf(column_name)



In [23]:
def moderforwork (column):
    """Replace the missing values of ``column`` in ``temp_df`` with 0."""
    zero_filled = temp_df[column].fillna(0)
    temp_df[column] = zero_filled

# Every survey column encoded above gets 0 wherever the answer is missing.
zero_filled_columns = [
    "WorkChallengeFrequencyPolitics",
    "WorkChallengeFrequencyUnusedResults",
    "WorkChallengeFrequencyDirtyData",
    "WorkChallengeFrequencyExplaining",
    "WorkChallengeFrequencyTalent",
    "WorkChallengeFrequencyClarity",
    "WorkChallengeFrequencyDataAccess",
    "WorkToolsFrequencySQL",
    "WorkToolsFrequencyR",
    "WorkToolsFrequencyPython",
    "LearningPlatformUsefulnessBlogs",
    "LearningPlatformUsefulnessKaggle",
    "LearningPlatformUsefulnessCourses",
    "LearningPlatformUsefulnessProjects",
    "LearningPlatformUsefulnessSO",
    "LearningPlatformUsefulnessTextbook",
    "LearningPlatformUsefulnessYouTube",
]
for column_name in zero_filled_columns:
    moderforwork(column_name)



## Grouping Age Column and Mapping

In [24]:
# Half-open age brackets: [13, 30) -> Young, [30, 50) -> Adult, [50, 101) -> Old.
bins = [13, 30, 50, 101]
labels = ['Young', 'Adult', 'Old']
Age_map = {'Young': 0, 'Adult': 1, 'Old': 2}

# Hand the integer codes to pd.cut directly. Binning to string labels and then
# calling .replace() on the resulting categorical column relies on replace()
# renaming categories, which recent pandas versions deprecate.
# Ages outside [13, 101) fall in no bracket and become NaN.
age_codes = [Age_map[label] for label in labels]
temp_df['Age'] = pd.cut(temp_df['Age'], bins=bins, labels=age_codes, right=False)



# Nominal Features

"GenderSelect", "Country",

"EmploymentStatus","CurrentJobTitleSelect", 
                                             
"CurrentEmployerType","MLToolNextYearSelect",
                                             
"MLMethodNextYearSelect","LanguageRecommendationSelect",
                                             
"MajorSelect","PastJobTitlesSelect",
                                             
"MLSkillsSelect","MLTechniquesSelect","EmployerIndustry",
                                             
"WorkAlgorithmsSelect","WorkInternalVsExternalTools",

"WorkMLTeamSeatSelect"

## Handle the Columns That Can Hold More Than One Value

In [25]:
def splitter(col_name):
  """Split a comma-separated column of ``temp_df`` and return its distinct tokens.

  The column is overwritten in ``temp_df`` with lists of tokens (split on ",").
  The returned list holds every distinct token once, in no particular order.
  """
  distinct_tokens = set()
  # Collect tokens from the distinct raw strings before the column is overwritten.
  for raw_value in temp_df[col_name].unique():
    distinct_tokens.update(raw_value.split(','))
  temp_df[col_name] = temp_df[col_name].str.split(",")
  return list(distinct_tokens)

def create_col(col):
  """Set column ``col`` of ``temp_df`` to 0 for every row, creating the column if needed."""
  temp_df[col] = 0

def change_values(list_name,col_name):
  """Build 0/1 indicator columns in ``temp_df`` from a column holding lists of tokens.

  list_name: the token names; one indicator column per name is set to 0 first.
  col_name:  the column of ``temp_df`` whose cells are lists of tokens.

  Rows are addressed by the labels 0..len(temp_df)-1.
  NOTE(review): an indicator column shares its name with the token, so a token
  equal to an existing column name would overwrite that column - confirm that
  no such collision occurs in the data.
  """
  for token_name in list_name:
    create_col(token_name)

  for row_label in range(len(temp_df)):
    row_tokens = temp_df.loc[row_label, col_name]
    for position in range(len(row_tokens)):
      temp_df.loc[row_label, row_tokens[position]] = 1


In [26]:


# Expand each multi-valued column into indicator columns, one column at a time.
for multi_value_column in ("CurrentEmployerType", "WorkAlgorithmsSelect"):
    list_work = splitter(multi_value_column)
    change_values(list_work, multi_value_column)


## Counting Map

In [27]:
# Replace each industry by the number of training rows carrying it.
# Values that do not occur in the training data map to NaN.
industry_counts = train_df["EmployerIndustry"].value_counts()
frequency = industry_counts.to_dict()
temp_df["EmployerIndustry"] = temp_df["EmployerIndustry"].map(frequency)
frequency


{'Technology': 1082,
 'Academic': 884,
 'Financial': 580,
 'Mix of fields': 532,
 'Other': 521,
 'Internet-based': 400,
 'Government': 302,
 'CRM/Marketing': 205,
 'Manufacturing': 191,
 'Telecommunications': 189,
 'Insurance': 181,
 'Retail': 152,
 'Non-profit': 86,
 'Pharmaceutical': 84,
 'Hospitality/Entertainment/Sports': 63,
 'Military/Security': 63}

In [28]:
# Replace each recommended language by the number of training rows carrying it.
# Values that do not occur in the training data map to NaN.
language_counts = train_df["LanguageRecommendationSelect"].value_counts()
frequency = language_counts.to_dict()
temp_df["LanguageRecommendationSelect"] = temp_df["LanguageRecommendationSelect"].map(frequency)
frequency

{'Python': 3322,
 'R': 1364,
 'SQL': 220,
 'C/C++/C#': 109,
 'Matlab': 94,
 'Scala': 60,
 'Java': 47,
 'SAS': 40,
 'Other': 37,
 'Julia': 15,
 'Stata': 12,
 'Haskell': 10,
 'F#': 2}

## Categorize Countries by their Continent

In [29]:
def country_to_continent(country_name):
    """Return the continent name for ``country_name`` using pycountry_convert.

    The lookup goes country name -> alpha-2 code -> continent code -> continent name.
    """
    alpha2_code = pc.country_name_to_country_alpha2(country_name)
    continent_code = pc.country_alpha2_to_continent_code(alpha2_code)
    return pc.convert_continent_code_to_continent_name(continent_code)


# Distinct country names, sorted. They are read straight from the column:
# grouping the whole frame and summing every other column just to obtain the
# group keys is wasteful and fails on columns that cannot be summed.
ct = sorted(temp_df["Country"].dropna().unique())

# Every country starts with the placeholder 0 and is then mapped to its continent.
ct_dict = dict((l,0) for l in ct)
for country_name in ct:
    if country_name in ("People 's Republic of China", "Republic of China"):
        # These two spellings are assigned to Asia directly rather than looked up.
        ct_dict[country_name] = "Asia"
    elif country_name != "Other":
        ct_dict[country_name] = country_to_continent(country_name)
    # "Other" keeps the placeholder 0, so its dummy column is named "Country_0".

temp_df = temp_df.replace({"Country": ct_dict})
temp_df = pd.get_dummies(temp_df, columns=["Country"])


## One Hot Encoding (k-1 for k values)

In [30]:
# Nominal columns encoded as indicators; drop_first removes one level per column.
nominal_columns = [
    "GenderSelect",
    "EmploymentStatus",
    "CurrentJobTitleSelect",
    "MajorSelect",
    "WorkMLTeamSeatSelect",
]
temp_df = pd.get_dummies(temp_df, columns=nominal_columns, drop_first=True)

## Separate Test and Train Data (Drop Some Columns)

In [31]:
# The first count_row rows of the merged frame are the training rows; the rest are test rows.
test_df = temp_df.iloc[count_row:]
temp_df = temp_df.iloc[:count_row]

print(test_df.shape)

# Columns removed from the test frame before modelling.
dropped_test_columns = [
    "WorkAlgorithmsSelect",
    "MLTechniquesSelect",
    "CurrentEmployerType",
    "WorkInternalVsExternalTools",
    "PastJobTitlesSelect",
    "MLSkillsSelect",
    "MLMethodNextYearSelect",
    "MLToolNextYearSelect",
]
test_df = test_df.drop(columns=dropped_test_columns).reset_index(drop=True)

(1000, 107)


In [32]:
# Columns removed from the training frame before modelling.
dropped_train_columns = [
    "WorkAlgorithmsSelect",
    "MLTechniquesSelect",
    "CurrentEmployerType",
    "WorkInternalVsExternalTools",
    "PastJobTitlesSelect",
    "MLSkillsSelect",
    "MLMethodNextYearSelect",
    "MLToolNextYearSelect",
]
temp_df = temp_df.drop(columns=dropped_train_columns).reset_index(drop=True)

# ML ALGORITHMS

Create a new dataframe to compare the predictions of each algorithm.

In [33]:
# Frame collecting each model's test predictions: the test IDs plus one
# zero-initialised float column per model.
# It is an explicit copy, so later column assignments do not write into a
# slice of test_df, and the zeros are literal rather than derived from
# CompensationScore * 0 (which would carry NaN over from a missing score).
df_empty = test_df[["ID"]].copy()
for model_column in ["linear", "Ridge", "gradient", "elastic", "lasso", "bayes"]:
    df_empty[model_column] = 0.0

# Feature matrices: everything except the ID column, with a fresh 0..n-1 index.
x = temp_df.drop(columns=["ID"]).reset_index(drop=True)
test = test_df.drop(columns=["ID"]).reset_index(drop=True)

## Linear Regression

In [34]:

# K-fold cross-validation of LinearRegression.
# Each fold's model also predicts the test set; those predictions are averaged.
folds = KFold(n_splits = 5)
n_folds = folds.get_n_splits()  # divisor for the average, kept in sync with KFold
scores = []
lin_model = LinearRegression()

# Accumulate in a local array so that re-running the cell starts from zero
# instead of adding to the values already stored in df_empty.
test_pred_sum = np.zeros(len(test))

for train_index, valid_index in folds.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    lin_model.fit(x_train, y_train)  # the same estimator object is refit on every fold
    y_pred = lin_model.predict(x_val)
    # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
    # scikit-learn releases deprecate.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    test_pred_sum += lin_model.predict(test)
    scores.append(rmse)

df_empty["linear"] = test_pred_sum / n_folds

print(df_empty["linear"])
print(mean(scores))


0      9.208505
1      7.322153
2      7.092672
3      6.952042
4      8.204510
         ...   
995    8.166856
996    6.705332
997    5.325575
998    7.392472
999    6.508757
Name: linear, Length: 1000, dtype: float64
1.9502265957873015


## Ridge CV

In [35]:
# K-fold cross-validation of RidgeCV.
# Each fold's model also predicts the test set; those predictions are averaged.
folds = KFold(n_splits = 5)
n_folds = folds.get_n_splits()  # divisor for the average, kept in sync with KFold
scores = []
ridge_model = RidgeCV()

# Accumulate in a local array so that re-running the cell starts from zero
# instead of adding to the values already stored in df_empty.
test_pred_sum = np.zeros(len(test))

for train_index, valid_index in folds.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    ridge_model.fit(x_train, y_train)  # the same estimator object is refit on every fold
    y_pred = ridge_model.predict(x_val)
    # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
    # scikit-learn releases deprecate.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    test_pred_sum += ridge_model.predict(test)
    scores.append(rmse)

df_empty["Ridge"] = test_pred_sum / n_folds

print(df_empty["Ridge"])
print(mean(scores))


0      9.192778
1      7.320973
2      7.077314
3      6.957574
4      8.220035
         ...   
995    8.197991
996    6.723941
997    5.330887
998    7.433579
999    6.472391
Name: Ridge, Length: 1000, dtype: float64
1.9461972578909348


## Gradient Boosting


In [36]:
# K-fold cross-validation of GradientBoostingRegressor (fixed random_state).
# Each fold's model also predicts the test set; those predictions are averaged.
folds = KFold(n_splits = 5)
n_folds = folds.get_n_splits()  # divisor for the average, kept in sync with KFold
scores = []
reg = GradientBoostingRegressor(random_state=25)

# Accumulate in a local array so that re-running the cell starts from zero
# instead of adding to the values already stored in df_empty.
test_pred_sum = np.zeros(len(test))

for train_index, valid_index in folds.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    reg.fit(x_train, y_train)  # the same estimator object is refit on every fold
    y_pred = reg.predict(x_val)
    # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
    # scikit-learn releases deprecate.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    test_pred_sum += reg.predict(test)
    scores.append(rmse)

df_empty["gradient"] = test_pred_sum / n_folds

print(df_empty["gradient"])
print(mean(scores))


0      8.854505
1      7.430425
2      7.233597
3      6.630643
4      8.456239
         ...   
995    8.102595
996    6.561562
997    5.643555
998    7.638048
999    6.627700
Name: gradient, Length: 1000, dtype: float64
1.9482844297280597


## LassoCV


In [37]:
# K-fold cross-validation of LassoCV.
# Each fold's model also predicts the test set; those predictions are averaged.
folds = KFold(n_splits = 5)
n_folds = folds.get_n_splits()  # divisor for the average, kept in sync with KFold
scores = []
lasso = LassoCV()

# Accumulate in a local array so that re-running the cell starts from zero
# instead of adding to the values already stored in df_empty.
test_pred_sum = np.zeros(len(test))

for train_index, valid_index in folds.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    lasso.fit(x_train, y_train)  # the same estimator object is refit on every fold
    y_pred = lasso.predict(x_val)
    # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
    # scikit-learn releases deprecate.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    test_pred_sum += lasso.predict(test)
    scores.append(rmse)

df_empty["lasso"] = test_pred_sum / n_folds

print(df_empty["lasso"])
print(mean(scores))


0      8.730651
1      7.464859
2      7.040546
3      6.815829
4      8.231543
         ...   
995    7.897943
996    6.516030
997    5.702439
998    7.366443
999    6.449866
Name: lasso, Length: 1000, dtype: float64
1.9527511839424092


## ElasticCV


In [38]:
# K-fold cross-validation of ElasticNetCV.
# Each fold's model also predicts the test set; those predictions are averaged.
folds = KFold(n_splits = 5)
n_folds = folds.get_n_splits()  # divisor for the average, kept in sync with KFold
scores = []
Elastic = ElasticNetCV()

# Accumulate in a local array so that re-running the cell starts from zero
# instead of adding to the values already stored in df_empty.
test_pred_sum = np.zeros(len(test))

for train_index, valid_index in folds.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    Elastic.fit(x_train, y_train)  # the same estimator object is refit on every fold
    y_pred = Elastic.predict(x_val)
    # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
    # scikit-learn releases deprecate.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    test_pred_sum += Elastic.predict(test)
    scores.append(rmse)

df_empty["elastic"] = test_pred_sum / n_folds

print(df_empty["elastic"])
print(mean(scores))


0      8.686600
1      7.452194
2      7.043894
3      6.813882
4      8.150200
         ...   
995    7.828917
996    6.498252
997    5.740350
998    7.354791
999    6.455319
Name: elastic, Length: 1000, dtype: float64
1.954535794000465


## Bayesian Ridge


In [39]:
# K-fold cross-validation of BayesianRidge.
# Each fold's model also predicts the test set; those predictions are averaged.
folds = KFold(n_splits = 5)
n_folds = folds.get_n_splits()  # divisor for the average, kept in sync with KFold
scores = []
clf = linear_model.BayesianRidge()

# Accumulate in a local array so that re-running the cell starts from zero
# instead of adding to the values already stored in df_empty.
test_pred_sum = np.zeros(len(test))

for train_index, valid_index in folds.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    clf.fit(x_train, y_train)  # the same estimator object is refit on every fold
    y_pred = clf.predict(x_val)
    # RMSE as sqrt(MSE): avoids the `squared=False` argument, which newer
    # scikit-learn releases deprecate.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    test_pred_sum += clf.predict(test)
    scores.append(rmse)

df_empty["bayes"] = test_pred_sum / n_folds

print(df_empty["bayes"])
print(mean(scores))



0      9.054468
1      7.404147
2      7.087795
3      6.865012
4      8.134673
         ...   
995    8.067023
996    6.699982
997    5.450531
998    7.517395
999    6.406867
Name: bayes, Length: 1000, dtype: float64
1.9417965784190443


## Prediction of Algorithms



In [40]:
df_empty

Unnamed: 0,ID,linear,Ridge,gradient,elastic,lasso,bayes
0,1,9.208505,9.192778,8.854505,8.686600,8.730651,9.054468
1,2,7.322153,7.320973,7.430425,7.452194,7.464859,7.404147
2,3,7.092672,7.077314,7.233597,7.043894,7.040546,7.087795
3,4,6.952042,6.957574,6.630643,6.813882,6.815829,6.865012
4,5,8.204510,8.220035,8.456239,8.150200,8.231543,8.134673
...,...,...,...,...,...,...,...
995,996,8.166856,8.197991,8.102595,7.828917,7.897943,8.067023
996,997,6.705332,6.723941,6.561562,6.498252,6.516030,6.699982
997,998,5.325575,5.330887,5.643555,5.740350,5.702439,5.450531
998,999,7.392472,7.433579,7.638048,7.354791,7.366443,7.517395


# Output Creation

We choose Bayesian Ridge because it gives the best (lowest) RMSE score.

In [41]:
# Submission frame: the test IDs with the Bayesian Ridge predictions under the name "Prediction".
output = df_empty[["ID", "bayes"]].rename(columns={'bayes': 'Prediction'})

output.describe()

Unnamed: 0,ID,Prediction
count,1000.0,1000.0
mean,500.5,6.792856
std,288.819436,0.935905
min,1.0,3.943252
25%,250.75,6.159394
50%,500.5,6.841939
75%,750.25,7.450792
max,1000.0,9.405892


In [42]:
# Write the submission to submission.csv without the row index.
output.to_csv("submission.csv", index=False)

# Future Work

A neural network gives better results, so we will implement it later.