In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _, files in os.walk('/kaggle/input'):
    # Print the full path of every file found in this directory.
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.impute import KNNImputer
from scipy import stats

In [None]:
# Load the engineering-graduate salary CSV into a DataFrame.
df=pd.read_csv('../input/engineering-graduate-salary-prediction/Engineering_graduate_salary.csv')

In [None]:
# Lift the column display limit so wide frames are shown in full, then
# preview the first 20 rows.
pd.set_option('display.max_columns', None)
df.head(20)

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
def data_preprocess(df):
    """Encode, clean and impute the graduate data.

    Steps, in order:
      1. map ``Degree`` and ``Gender`` to integer codes,
      2. one-hot encode ``CollegeState`` (first level dropped),
      3. derive ``Birth_year`` from ``DOB``,
      4. drop ``ID``, ``CollegeID``, ``CollegeCityID`` and ``DOB``,
      5. turn every ``-1`` into NaN and fill the gaps with ``KNNImputer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above. The imputer is fitted on
        every remaining column, so those are expected to be numeric already.
        The caller's frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame built from the imputer's output, with the processed column
        names and a fresh default index.
    """
    # Work on a copy. Assigning the mapped columns onto the caller's frame
    # meant a second call (or a re-run after a failure further down) mapped
    # the already-encoded values to NaN.
    df = df.copy()

    # mapping and encoding; values absent from a mapping become NaN and are
    # filled by the imputer below
    df['Degree'] = df['Degree'].map(
        {"B.Tech/B.E.": 1, "M.Tech./M.E.": 2, "MCA": 3, "M.Sc. (Tech.)": 4}
    )
    df['Gender'] = df['Gender'].map({"m": 1, "f": 2})
    df = pd.get_dummies(df, columns=['CollegeState'], drop_first=True)

    # keep only the year of the date of birth
    # NOTE(review): assumes DOB parses with '%Y/%m/%d' - confirm against the CSV
    df['Birth_year'] = pd.to_datetime(df['DOB'], format='%Y/%m/%d').dt.year

    # drop the identifier columns and the raw date of birth
    df = df.drop(columns=["ID", "CollegeID", "CollegeCityID", "DOB"])

    # -1 is treated as the missing-value marker
    df = df.replace(-1, np.nan)

    # fill missing values with a k-nearest-neighbours imputer (default settings)
    imputer = KNNImputer()
    imputed = imputer.fit_transform(df)

    return pd.DataFrame(imputed, columns=df.columns)
    
    

In [None]:
# Number of distinct values found in each column.
{name: len(df[name].unique()) for name in df.columns}

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label-encode the board and specialization columns. Each column gets its own
# encoder: refitting one shared encoder replaced its fitted classes every
# time, so only the last column's code-to-label mapping could be recovered.
label_encoders = {}
for column in ['10board', '12board', 'Specialization']:
    Lencode = LabelEncoder()
    df[column] = Lencode.fit_transform(df[column])
    label_encoders[column] = Lencode
# Lencode stays bound to the 'Specialization' encoder, as before.


In [None]:
# Replace the frame with its encoded and imputed version.
df =data_preprocess(df)

In [None]:
# Show any columns that still have object dtype after preprocessing.
df.select_dtypes('object')

In [None]:
# One box plot per column, on a grid 10 plots wide.
grid_cols = 10
# Keep the original 10 rows and add more only when there are over 100
# columns; plt.subplot raises once the index exceeds rows * cols.
grid_rows = max(10, -(-len(df.columns) // grid_cols))
plt.figure(figsize=(16, 20))
for i, column in enumerate(df.columns):
    plt.subplot(grid_rows, grid_cols, i + 1)
    # Name the argument explicitly: a lone positional argument is read as `x`
    # by older seaborn releases and as `data` by newer ones.
    sns.boxplot(x=df[column])

In [None]:
def find_outliers(df, threshold):
    """Count, per column, the values whose absolute z-score exceeds a cut-off.

    The cut-off is ``stats.norm.ppf(threshold)``, i.e. the standard-normal
    quantile at ``threshold``. It is compared against the *absolute* z-score,
    so values far below the mean are counted as well as values far above it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are z-scored independently.
    threshold : float
        Probability passed to ``stats.norm.ppf``.

    Returns
    -------
    pandas.Series
        Number of flagged values, indexed by column name.
    """
    # z-score cut-off implied by the requested quantile
    cutoff = stats.norm.ppf(threshold)

    # absolute z-score of every cell, relabelled with the original columns
    abs_scores = np.abs(stats.zscore(df))
    flagged = pd.DataFrame(abs_scores, columns=df.columns) > cutoff

    return flagged.sum(axis=0)

In [None]:
# Standard-normal quantile at 0.9999: the z-score cut-off used below.
stats.norm.ppf(0.9999)

In [None]:
# Per-column count of values beyond the 0.9999 cut-off.
find_outliers(df, 0.9999)

In [None]:
def remove_outliers(df, threshold):
    """Return ``df`` without the rows that hold an outlier in any column.

    A value is an outlier when its absolute column-wise z-score exceeds
    ``stats.norm.ppf(threshold)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are z-scored independently.
    threshold : float
        Probability passed to ``stats.norm.ppf``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with no outlier, keeping their original index
        labels. The input frame is not modified.
    """
    # z-score cut-off implied by the requested quantile
    cutoff = stats.norm.ppf(threshold)

    # absolute z-score of every cell as a plain array (row order matches df)
    abs_scores = np.abs(np.asarray(stats.zscore(df)))

    # a row is rejected as soon as one of its cells is beyond the cut-off
    has_outlier = (abs_scores > cutoff).any(axis=1)

    # Filter by position rather than dropping by label: with duplicated index
    # labels, a label-based drop also removed rows that were not outliers.
    return df.loc[~has_outlier]


In [None]:
# Keep only the rows with no value beyond the 0.9999 cut-off.
df = remove_outliers(df, 0.9999)

In [None]:
# (rows, columns) after outlier removal.
df.shape

In [None]:
# Preview the first rows of the filtered frame.
df.head()

In [None]:
# One box plot per column, on a grid 10 plots wide.
grid_cols = 10
# Keep the original 10 rows and add more only when there are over 100
# columns; plt.subplot raises once the index exceeds rows * cols.
grid_rows = max(10, -(-len(df.columns) // grid_cols))
plt.figure(figsize=(16, 20))
for i, column in enumerate(df.columns):
    plt.subplot(grid_rows, grid_cols, i + 1)
    # Name the argument explicitly: a lone positional argument is read as `x`
    # by older seaborn releases and as `data` by newer ones.
    sns.boxplot(x=df[column])

In [None]:
# Separate the target column (Salary) from the predictors.
y = df['Salary']
X = df.drop(columns=['Salary'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted later on the training split.
sc=StandardScaler()

In [None]:
# 70% of the rows go to training, the rest to testing; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=1
)

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
sc.fit(X_train)

# Rebuild the frames with their original column names and row index. A bare
# pd.DataFrame(array) replaced both with 0..n-1, so the features became
# anonymous and no longer lined up with y_train / y_test by index.
X_train = pd.DataFrame(
    sc.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    sc.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression with alpha chosen by a 5-fold cross-validated grid search.
R = Ridge()
param = {
    'alpha': [
        1e-20, 1e-15, 1e-10, 1e-5, 1e-2,
        1, 3, 5, 10, 15, 20, 25, 30, 50, 65, 80,
    ],
}
GSCV = GridSearchCV(estimator=R, param_grid=param, cv=5)
GSCV.fit(X_train, y_train)

In [None]:
# Best alpha found by the grid search and its mean cross-validated score.
print(GSCV.best_params_)
print(GSCV.best_score_)

In [None]:
# Test-split predictions from the grid-searched Ridge model.
predict = GSCV.predict(X_test)

In [None]:
from sklearn import metrics

In [None]:
# Error metrics for the grid-searched Ridge model on the test split.
print('MAE:', metrics.mean_absolute_error(y_test, predict))
print('MSE:', metrics.mean_squared_error(y_test, predict))
# RMSE is the square root of the MSE; the root was previously taken of the MAE.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predict)))

In [None]:
import seaborn as sns
# Distribution of the Ridge residuals (actual minus predicted).
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot /
# displot are the suggested replacements - confirm the installed version.
sns.distplot(y_test-predict)

In [None]:
from sklearn.linear_model import LinearRegression
# Plain linear regression with default settings, for comparison.
lr = LinearRegression()

In [None]:
# Fit the linear regression on the scaled training split.
lr.fit(X_train,y_train)

In [None]:
# Test-split predictions from the linear regression.
predictlr = lr.predict(X_test)

In [None]:
# Error metrics for the linear regression on the test split.
print('MAE:', metrics.mean_absolute_error(y_test, predictlr))
print('MSE:', metrics.mean_squared_error(y_test, predictlr))
# RMSE is the square root of the MSE; the root was previously taken of the MAE.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictlr)))

In [None]:
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted regressor with default settings.
xgb = XGBRegressor()

In [None]:
# Fit the XGBoost regressor on the scaled training split.
xgb.fit(X_train,y_train)

In [None]:
# Test-split predictions from the XGBoost regressor.
predictxgb = xgb.predict(X_test)

In [None]:
# Error metrics for the XGBoost regressor on the test split.
print('MAE:', metrics.mean_absolute_error(y_test, predictxgb))
print('MSE:', metrics.mean_squared_error(y_test, predictxgb))
# RMSE is the square root of the MSE; the root was previously taken of the MAE.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictxgb)))

In [None]:
# Score of the XGBoost regressor on the test split.
xgb.score(X_test,y_test)

In [None]:
# Score of the linear regression on the test split.
lr.score(X_test,y_test)

In [None]:
# Score of the grid-searched Ridge model on the test split.
GSCV.score(X_test,y_test)

In [None]:
import seaborn as sns
# Distribution of the XGBoost residuals (actual minus predicted).
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot /
# displot are the suggested replacements - confirm the installed version.
sns.distplot(y_test-predictxgb)