In [None]:

import numpy as np
import pandas as pd

## Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


## Machine Learning Libraries

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

## Regression Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Cross Val Score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler


import warnings
warnings.filterwarnings('ignore')


In [None]:
data = pd.read_csv('data.csv')

In [None]:
data.head()

In [None]:
data[data['specialisation'] == 'Mkt&Fin']

In [None]:
## Info about the dataset: column names, dtypes and non-null counts
data.info()

In [None]:
data.isnull().sum()

In [None]:
## Students who were not placed have no salary recorded; treat the missing salary as 0.
data['salary'] = data['salary'].fillna(0.0)

In [None]:
data.isnull().sum()

In [None]:
def remove_duplicates(data):
    """Drop repeated rows from ``data`` in place, keeping the first occurrence of each.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to de-duplicate. It is modified in place.

    Returns
    -------
    str
        The fixed status message ``"Checked Duplicates"``.
    """
    status_message = "Checked Duplicates"
    data.drop_duplicates(inplace=True, keep='first')
    return status_message
# De-duplicate the loaded frame in place; the returned status string is what this cell displays.
remove_duplicates(data)

## Explore Categorical Columns

In [None]:
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()

In [None]:
categorical_columns

In [None]:
def more_info_cate_data(dataset, categorical_col):
    """Print each categorical column name followed by a tuple of its distinct values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that holds the columns.
    categorical_col : list of str
        Names of the columns to report on.
    """
    for col_name in categorical_col:
        distinct_values = tuple(dataset[col_name].unique())
        print(col_name, '=', distinct_values)

In [None]:
more_info_cate_data(data, categorical_columns)

In [None]:
## More Info About Categorical Data

def more_info_cate_data(dataset, categorical_col):
    """Print a detailed summary of every categorical column.

    For each column this prints the number of distinct values, the distinct
    values themselves, the per-value row counts and a separator line of 100
    asterisks.

    Note: this definition reuses the name of the shorter helper defined
    earlier in the notebook and therefore replaces it once this cell has run.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that holds the columns.
    categorical_col : list of str
        Names of the columns to summarise.
    """
    separator = '*' * 100
    for col_name in categorical_col:
        column = dataset[col_name]
        print('Total Number of Unique Variables in ', col_name, 'columns is', column.nunique(), 'its Unique values are', column.unique())
        print(column.value_counts())
        print(separator)

In [None]:
more_info_cate_data(data, categorical_columns)

## Visualize the Categorical columns

In [None]:
# One bar chart per categorical column, showing how many rows fall into each category.
for x in categorical_columns:
    plt.figure()  # start a new figure so the charts do not draw over each other
    # The column is named through x=. Passing the Series positionally together
    # with data= is rejected by current seaborn, whose first positional
    # parameter is `data`.
    sns.countplot(x=x, data=data)
    plt.xlabel(x)
    plt.ylabel('Count')  # the bars are row counts, not salaries
    plt.title(x)
    plt.xticks(rotation=90)

In [None]:
data.describe()

In [None]:
### Drop columns that are not useful as features

# 'sl_no' is presumably just a row serial number (TODO confirm against the data source).
# Rebinding `data` instead of dropping in place, and ignoring an already-missing
# column, keeps this cell safe to re-run: the original in-place drop raised
# KeyError on a second execution.
data = data.drop(columns=['sl_no'], errors='ignore')

## Relationships of Percentages with respect to Salary.

In [None]:
# Every non-object column counts as numerical here; the list is reused by later cells.
numerical_columns = list(data.select_dtypes(exclude=['object']).columns)
print(numerical_columns)
# One scatter plot with a fitted regression line per numerical column, against salary.
for x in numerical_columns:
    plt.figure()
    sns.regplot(x=x, y='salary', data=data)

### Boxplot of numerical data to check the outliers

In [None]:
# One box plot per numerical column to eyeball outliers.
for x in numerical_columns:
    plt.figure()
    # Pass the column explicitly as x=. A bare positional argument is read as
    # `x` by older seaborn but as `data` by current releases, so the plot
    # would otherwise depend on the installed version.
    sns.boxplot(x=data[x])

In [None]:
data.describe()

In [None]:
## Remove the outlier from the salary column
# Rows whose salary equals 940000.0 are dropped.
# .copy() makes the result an independent frame: later cells assign encoded
# columns back into `data`, and doing that on a filtered view triggers
# pandas' SettingWithCopy behaviour.
data = data[data['salary'] != 940000.0].copy()

In [None]:
data.describe()

In [None]:
## Encoding Categorical Columns
categorical_columns

In [None]:
data

## Encoding Categorical Columns

In [None]:
from sklearn.preprocessing import LabelEncoder


def _label_encode(column_name):
    """Label-encode one column of the notebook-level ``data`` frame.

    A new LabelEncoder is fitted on ``data[column_name]``, the column is
    overwritten with the resulting integer codes, the column name and its
    distinct codes are printed, and the fitted encoder is returned.
    """
    encoder = LabelEncoder()
    data[column_name] = encoder.fit_transform(data[column_name])
    print(column_name, data[column_name].unique())
    return encoder


# One fitted encoder per categorical column. Each is kept under its own name
# because later cells use them to encode a new sample and to pickle them.
le_gender = _label_encode('gender')

le_ssc_b = _label_encode('ssc_b')

le_hsc_b = _label_encode('hsc_b')

le_hsc_s = _label_encode('hsc_s')

le_degree_t = _label_encode('degree_t')

le_workex = _label_encode('workex')

le_specialisation = _label_encode('specialisation')

le_status = _label_encode('status')


## Split data into training and testing

In [None]:
# Features are every column except the target; the target is the salary column.
# NOTE(review): X still contains 'status'. Unplaced students were given salary 0
# earlier in the notebook, so this column may leak the target. Confirm that it
# is meant to be a feature.
X = data.drop('salary', axis = 1)
y = data['salary']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
## Feature Importance
from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt

# Salary is a continuous target, so a regressor is used. ExtraTreesClassifier
# would treat every distinct salary value as its own class label, which makes
# the resulting importances hard to interpret.
# The random_state keeps the importances the same from run to run.
model = ExtraTreesRegressor(random_state=42)
model.fit(X, y)

In [None]:
# Pair every importance score with its feature name, then chart the 13 largest as horizontal bars.
ranked_features = pd.Series(data=model.feature_importances_, index=X.columns)
top_features = ranked_features.nlargest(n=13)
top_features.plot.barh()
plt.show()

## Machine Learning Models

In [None]:
# Candidate regressors, all with default hyperparameters.
# The tree-based and boosted models involve randomness, so they get a fixed
# random_state (the same 42 used for the train/test split). Their scores are
# then reproducible across Restart-and-Run-All.
LinearRegression_Model = LinearRegression()
DecisionTreeRegression_Model = DecisionTreeRegressor(random_state=42)
RandomForestRegression_Model = RandomForestRegressor(random_state=42)
Xgboost_Model = xgb.XGBRegressor(random_state=42)
Ridge_Model = Ridge()
Lasso_Model = Lasso()
ElasticNet_Model = ElasticNet()

# The order matters: later cells look up a display name by position in this list.
ml_models = [LinearRegression_Model, DecisionTreeRegression_Model, RandomForestRegression_Model, Xgboost_Model, Ridge_Model, Lasso_Model, ElasticNet_Model]

In [None]:
# Placeholders for tracking the best model; nothing visible in the notebook updates them.
best_accuracies = 0.0
best_regressor = 0

# Display name for each entry of ml_models, keyed by its position in that list.
# The last label was misspelled 'ElsticNetRegression' and showed up that way in every printout.
model_dict = {0:'LinearRegression', 1: 'DecisionTreeRegressor', 2: 'RandomForestRegressor', 3:'XgboostRegressor',4:'RidgeRegression',5:'LassoRegression',6:'ElasticNetRegression'}

In [None]:
# Fit every candidate on the training split.
# The loop variable is deliberately not called `model`: that name holds the
# feature-importance estimator, and rebinding it here would break a re-run of
# the importance plot (the last candidate has no feature_importances_).
for estimator in ml_models:
    estimator.fit(X_train, y_train)

In [None]:
# For every fitted model: print its score on the test split and overlay the
# density of the true test targets (red) with the density of its predictions (blue).
for i, models in enumerate(ml_models):
    y_pred = models.predict(X_test)
    print( model_dict[i],models.score(X_test, y_test))
    plt.figure(figsize=(12,8))
    # kdeplot replaces the deprecated distplot(..., hist=False); both draw only the density curve.
    sns.kdeplot(x=y_test, color='r', label='Test Distribution')
    sns.kdeplot(x=y_pred, color='b', label='Predicted Distribution')
    plt.legend()
    plt.title('Test vs Predicted Distribution')
    plt.show()

In [None]:
# Print the mean cross-validation score of each model over 10 folds.
# Note that the full X and y are passed here, not only the training split.
for i, models in enumerate(ml_models):
    print('Cross Validation Score of',model_dict[i],'is', (cross_val_score(models,X,y,cv=10)).mean()) 

In [None]:
# Print the root-mean-squared error of every fitted model on the test split.
for i, models in enumerate(ml_models):
    y_pred = models.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(model_dict[i], rmse)

# Hyperparameter Optimization of Models

In [None]:
## Metric Function

## Metrics Functions

In [None]:
def metric(y_test, y_pred):
    """Print regression metrics for a set of predictions and plot both distributions.

    Prints the mean squared error, its square root, the R^2 score, the mean
    absolute error and the median absolute error. It then draws the density
    of the true values (red) and of the predictions (blue) on one figure.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_test``.

    Returns
    -------
    None
        Everything is printed or plotted; nothing is returned.
    """
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import median_absolute_error

    mean_error = mean_squared_error(y_test, y_pred)
    root_mean_squared_error = np.sqrt(mean_error)
    score = r2_score(y_test, y_pred)
    absolute_error = mean_absolute_error(y_test, y_pred)
    median_error = median_absolute_error(y_test, y_pred)

    print('Mean-Squared-Error:{}'.format(mean_error))
    print('Root-Mean-Squared-Error:{}'.format(root_mean_squared_error))
    print('Score:{}'.format(score))
    print('Absolute Error:{}'.format(absolute_error))
    print('Median-Absolute-Error:{}'.format(median_error))

    plt.figure(figsize=(12,8))
    # kdeplot replaces the deprecated distplot(..., hist=False); both draw only the density curve.
    sns.kdeplot(x=y_test, color='r', label='Test Distribution')
    sns.kdeplot(x=y_pred, color='b', label='Predicted Distribution')
    plt.legend()
    plt.title('Test vs Predicted Distribution')
    plt.show()

## Hyperparameter Optimization of Linear Regression Model

In [None]:
def multiple_regression(x_train, x_test, y_train, y_test):
    """Grid-search a LinearRegression model and report its test metrics.

    The search uses 5-fold cross-validation scored with R^2. The best
    parameter combination is printed, predictions are made for ``x_test``
    and handed to ``metric`` for reporting.

    Parameters
    ----------
    x_train, y_train
        Training features and targets used for the grid search.
    x_test, y_test
        Held-out features and targets used for the final report.

    Returns
    -------
    None
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import GridSearchCV
    regressor = LinearRegression()

    ## Parameters to tune
    supported_params = regressor.get_params()
    print('Parameters to be tune:{}'.format(supported_params.keys()))

    candidate_params = {"copy_X":[True, False],
                        "fit_intercept":[True, False],
                        "normalize":[True,False]}
    # "normalize" is not accepted by LinearRegression in newer scikit-learn
    # releases, and GridSearchCV raises on unknown parameter names. Only
    # options the installed estimator actually exposes are searched, so the
    # cell runs on both old and new versions.
    params = {name: values for name, values in candidate_params.items()
              if name in supported_params}
    grid = GridSearchCV(regressor, params, cv=5, scoring='r2',n_jobs=-1)
    grid.fit(x_train, y_train)
    y_pred = grid.predict(x_test)

    print('Best Hyperparameters Used:{}'.format(grid.best_params_))

    metric(y_test, y_pred)
    
    

In [None]:
multiple_regression(X_train, X_test, y_train, y_test)

## Hyperparameter Optimization of Decision Tree Regressor

In [None]:
def decision_tree(x_train, x_test, y_train, y_test):
    """Grid-search a DecisionTreeRegressor and report its test metrics.

    The search uses 5-fold cross-validation. The best parameter combination
    is printed, predictions are made for ``x_test`` and handed to ``metric``
    for reporting.

    Parameters
    ----------
    x_train, y_train
        Training features and targets used for the grid search.
    x_test, y_test
        Held-out features and targets used for the final report.

    Returns
    -------
    None
    """
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.model_selection import GridSearchCV

    regressor = DecisionTreeRegressor()

    ## Parameters to be tune
    print('Hyper-Parameters to be tune: {}'.format(regressor.get_params()))

    ## Setting and tuning the hyperparameters
    # The criterion names 'mse' and 'mae' were renamed to 'squared_error' and
    # 'absolute_error' (scikit-learn 1.0) and the old spellings were later
    # removed, so the current names are used here.
    params = {"criterion":['squared_error','absolute_error'],
             "min_samples_split":[10,20,30,40],
             "max_depth":[2,4,6,8],
             "min_samples_leaf":[20, 40,60,100],
             "max_leaf_nodes":[5,10,20,30]}
    grid = GridSearchCV(regressor, params, cv=5)
    grid.fit(x_train, y_train)
    y_pred = grid.predict(x_test)

    ## Displaying the best hyperparameters used
    print("Best Hyper-Parameters used : {}".format(grid.best_params_))

    ## Reporting the metrics
    metric(y_test, y_pred)

In [None]:
decision_tree(X_train, X_test, y_train, y_test)

## Hyperparameter Optimization of Random Forest Regressor

In [None]:
def random_forest(x_train, x_test, y_train, y_test):
    """Grid-search a RandomForestRegressor and report its test metrics.

    The search uses 5-fold cross-validation. The best parameter combination
    is printed, predictions are made for ``x_test`` and handed to ``metric``
    for reporting.

    Parameters
    ----------
    x_train, y_train
        Training features and targets used for the grid search.
    x_test, y_test
        Held-out features and targets used for the final report.

    Returns
    -------
    None
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    # A fixed seed makes the search pick the same winner on every run.
    regressor = RandomForestRegressor(random_state=42)

    ## Parameters to be tune
    print('Hyper-Parameters to be tune: {}'.format(regressor.get_params()))

    ## Setting and tuning the hyperparameters
    # max_features='auto' is no longer accepted by newer scikit-learn. For a
    # regressor it meant "use all features", which the fraction 1.0 expresses
    # in every version.
    params = {"n_estimators":[10,20,30,40],
             "max_features":[1.0, 'log2', 'sqrt'],
             "bootstrap":[True, False]}
    grid = GridSearchCV(regressor, params, cv=5)
    grid.fit(x_train, y_train)
    y_pred = grid.predict(x_test)

    ## Displaying the best hyperparameters used
    print("Best Hyper-Parameters used : {}".format(grid.best_params_))

    ## Reporting the metrics
    metric(y_test, y_pred)

In [None]:
random_forest(X_train, X_test, y_train, y_test)

## Hyperparameter Optimization of XGBoost Regressor Model

In [None]:
def XGBoost(x_train, x_test, y_train, y_test):
    """Grid-search an XGBRegressor and report its test metrics.

    Warnings are silenced, the estimator's default parameters are printed,
    and a 5-fold grid search is run over learning rate and tree depth with
    the remaining options fixed. The best combination is printed and the
    test-set predictions are handed to ``metric``.

    Parameters
    ----------
    x_train, y_train
        Training features and targets used for the grid search.
    x_test, y_test
        Held-out features and targets used for the final report.

    Returns
    -------
    None
    """
    import warnings
    from sklearn.model_selection import GridSearchCV
    from xgboost import XGBRegressor

    warnings.filterwarnings("ignore")

    base_estimator = XGBRegressor()

    # Show what can be tuned (the defaults of a fresh estimator).
    print('Hyper-Parameters to be tune: {}'.format(base_estimator.get_params()))

    # Only learning_rate and max_depth vary; every other entry has a single value.
    search_space = {
        "nthread": [3],
        "learning_rate": [0.01, 0.03, 0.05, 0.06],
        "max_depth": [4, 5, 6, 8],
        "min_child_weight": [4],
        "subsample": [0.7],
        "colsample_bytree": [0.7],
        "n_estimators": [500],
    }
    search = GridSearchCV(base_estimator, search_space, cv=5)
    search.fit(x_train, y_train)
    predictions = search.predict(x_test)

    # Report the winning combination, then the metrics on the test split.
    print("Best Hyper-Parameters used : {}".format(search.best_params_))
    metric(y_test, predictions)

In [None]:
XGBoost(X_train, X_test, y_train, y_test)

In [None]:
# Final model with hard-coded hyperparameters.
# NOTE(review): these values are presumably copied from the grid search output above;
# re-check them whenever that search is re-run. This also rebinds `model`.
model  =  xgb.XGBRegressor(colsample_bytree= 0.7, learning_rate = 0.01, max_depth= 4, min_child_weight= 4, n_estimators=500, nthread= 3, subsample= 0.7)

In [None]:
# Plain NumPy copies of the splits (no column names). The final model is fitted on
# these arrays and is later asked to predict on a hand-built NumPy array as well.
X_test_ = X_test.values
X_train_ = X_train.values

In [None]:
model.fit(X_train_,y_train)

In [None]:
y_pred = model.predict(X_test_)

In [None]:
score = model.score(X_test_, y_test)

In [None]:
score

## Test Model

In [None]:
X_test.columns

In [None]:
# A single hand-written sample with 13 fields; categorical fields are still raw strings here.
# NOTE(review): the field order presumably follows X_test.columns shown above; verify.
# NOTE(review): this rebinds X, so the feature DataFrame defined earlier is no longer
# reachable under that name after this cell.
X = np.array([['M',67.0,'Others',91.0,'Others','Commerce',58.0,'Sci&Tech','No',55.0,'Mkt&Fin',58.80,'Placed']])
X

In [None]:
categorical_columns


In [None]:
# Replace each raw category string with the integer code produced by the encoder
# fitted for that column; the number is the column's position within the sample.
for position, encoder in (
    (0, le_gender),
    (2, le_ssc_b),
    (4, le_hsc_b),
    (5, le_hsc_s),
    (7, le_degree_t),
    (8, le_workex),
    (10, le_specialisation),
    (12, le_status),
):
    X[:, position] = encoder.transform(X[:, position])

# Every field is numeric text by now, so the whole array can become float.
X = X.astype(float)
X

In [None]:
y_pred = model.predict(X)
y_pred

## Save the Model in Pickle Format

In [None]:
import pickle

In [None]:
# Bundle the trained model with every fitted encoder so that code loading the
# file can apply the same categorical encoding before predicting.
# The dict gets its own name: binding it to `data` would overwrite the DataFrame
# and make the earlier cells impossible to re-run without reloading the CSV.
model_bundle = {"model":model,"le_gender": le_gender, "le_ssc_b":le_ssc_b, "le_hsc_b":le_hsc_b, "le_hsc_s":le_hsc_s,"le_degree_t":le_degree_t,"le_workex":le_workex,"le_specialisation":le_specialisation,"le_status":le_status}
with open('xgboost_model.pkl','wb') as file:
    pickle.dump(model_bundle,file)

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file.
# Only load a pickle that this notebook (or another trusted source) wrote.
# The loaded dict is kept under its own name instead of reusing `data`, which
# is the name of the DataFrame in the earlier cells.
with open('xgboost_model.pkl', 'rb') as file:
    loaded_bundle = pickle.load(file)

# Unpack the model and the per-column encoders from the bundle.
regressor_loaded = loaded_bundle['model']
le_gender = loaded_bundle['le_gender']
le_ssc_b = loaded_bundle['le_ssc_b']
le_hsc_b = loaded_bundle['le_hsc_b']
le_hsc_s = loaded_bundle['le_hsc_s']
le_degree_t = loaded_bundle['le_degree_t']
le_workex = loaded_bundle['le_workex']
le_specialisation = loaded_bundle['le_specialisation']
le_status = loaded_bundle['le_status']



In [None]:
y_pred = regressor_loaded.predict(X)

In [None]:
y_pred