In [None]:
# Basic
import numpy as np 
import pandas as pd 

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Scaling
from sklearn.preprocessing import StandardScaler

# train test split
from sklearn.model_selection import train_test_split

# Making Polynomial Features
from sklearn.preprocessing import PolynomialFeatures

# Importing models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Regression Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# To build optimal model using Backward Elimination
import statsmodels.api as sm

# Cross validation
from sklearn.model_selection import cross_val_score

In [None]:
# Read the campus-placement CSV; index_col=False stops pandas from using the first column as the index
dataset = pd.read_csv(
    '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv',
    index_col=False,
)

dataset.head()

## EDA (w.r.t. Salary)

In [None]:
# Print pandas' built-in summary of the frame
dataset.info()

In [None]:
# Descriptive statistics of the frame's columns
dataset.describe()

In [None]:
# Number of missing values in each column
dataset.isna().sum()

**We can see that the salaries of 67 students are missing because those students were not placed.**

Remove the rows that have data of unplaced students because we want to develop a model that predicts the salary of a placed student.

In [None]:
# Discard, in place, every row that contains at least one missing value
dataset.dropna(axis='index', how='any', inplace=True)
print(dataset.shape)

We can see that we now have (215 - 67 = ) 148 entries. We have successfully removed the null salary entries. 

We can also remove the 'status' column as we are predicting salaries of the students assuming they were already placed. So, that column would be 'Placed' for every student.

In [None]:
# Remove the 'status' column in place.
# errors='ignore' lets the cell be re-run after the column is already gone;
# the redundant axis=1 is dropped because `columns=` already names the axis.
dataset.drop(columns=['status'], inplace=True, errors='ignore')
dataset.head(2)

### Salary Distribution

In [None]:
# Histogram + KDE of the salary column.
# NOTE(review): seaborn marks distplot as deprecated; it is kept here so the
# cell still runs on the seaborn version this notebook was written against.
sns.distplot(a = dataset['salary'])
plt.title('Salary Distribution')
plt.xlabel('Salary')
# The on/off flag is passed positionally: Matplotlib renamed the `b` keyword to
# `visible`, and newer releases reject `b=`. Positional form works on both.
plt.grid(True, which='major', color='#666666', linestyle='-')
plt.minorticks_on()
plt.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.2)
plt.show()

OBSERVATION:

The salary distribution is **centered around 250k**. The range of salaries that are mostly given out lie in the region **200k - 400k**. 

We can also see some outliers >400k.

### 1. Gender

In [None]:
# Fix the category order once so that the violins, the medians and the counts
# all refer to the same category at the same x position.
order = list(dataset['gender'].unique())
ax = sns.violinplot(x = 'gender', y = 'salary', data = dataset, order = order)

# Median salary and number of rows per category, re-ordered to match the x axis.
# (Previously medians were sorted by category name and counts by frequency,
# so labels could be attached to the wrong violin.)
group_stats = dataset.groupby('gender')['salary'].agg(['median', 'size']).reindex(order)

# Write "n = <count>" just above each category's median
for xpos, (median, count) in enumerate(zip(group_stats['median'], group_stats['size'])):
    ax.text(xpos, median + 0.04, 'n = {}'.format(count), horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('Gender vs Salary')
# Grid flag passed positionally: the `b` keyword was renamed to `visible` and
# is rejected by newer Matplotlib releases.
plt.grid(True, which='major', axis='both', color='#666666', linestyle='-')
plt.minorticks_on()
plt.grid(True, which='minor', axis='y', color='#999999', linestyle='-', alpha=0.2)
plt.show()

OBSERVATION:

1. The median salary is slightly higher for males than females. 
2. The distribution is also skewed in terms of females getting placed more than males.
3. The range of salaries being offered to males is much higher than the range of salaries being offered to females.
4. Maximum salary offered to a male is 10,00,000 and that to a female is 7,00,000.

### 2. ssc_p (Senior Secondary Percentage)

In [None]:
# Scatter plot with a fitted regression line: SSC percentage against salary
plt.figure(figsize=(8,6))
sns.regplot(x='ssc_p', y='salary', data = dataset)
plt.minorticks_on()
# Grid flag passed positionally: the `b` keyword was renamed to `visible` and
# is rejected by newer Matplotlib releases.
plt.grid(True, which='both', axis='both', alpha=0.1)
plt.title('Salary vs SSC Percentage')
plt.show()

OBSERVATION:

There seems to be almost 0 correlation between the senior secondary percentage and the salaries offered to the students.

### 3. ssc_b (Senior Secondary Board)

In [None]:
# Fix the category order once so that the violins, the medians and the counts
# all refer to the same category at the same x position.
order = list(dataset['ssc_b'].unique())
ax = sns.violinplot(x = 'ssc_b', y = 'salary', data = dataset, order = order)

# Median salary and number of rows per category, re-ordered to match the x axis.
# (Previously medians were sorted by category name and counts by frequency,
# so labels could be attached to the wrong violin.)
group_stats = dataset.groupby('ssc_b')['salary'].agg(['median', 'size']).reindex(order)

# Write "n = <count>" just above each category's median
for xpos, (median, count) in enumerate(zip(group_stats['median'], group_stats['size'])):
    ax.text(xpos, median + 0.04, 'n = {}'.format(count), horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('Salary vs SSC Board')
# Grid flag passed positionally: the `b` keyword was renamed to `visible` and
# is rejected by newer Matplotlib releases.
plt.grid(True, which='major', axis='both', color='#666666', linestyle='-')
plt.minorticks_on()
plt.grid(True, which='minor', axis='y', color='#999999', linestyle='-', alpha=0.2)
plt.show()

OBSERVATION:

1. The median lies around the same value for both the boards.
2. The range of salaries offered to students of the Central board is more than the range offered to the students of Other boards. 

We can't particularly say if it is a good indicator for the salary. We will check the correlation later and also in the model refinement process.

### 4. hsc_p (Higher Secondary Percentage)

In [None]:
# Scatter plot with a fitted regression line: HSC percentage against salary
plt.figure(figsize=(8,6))
sns.regplot(x='hsc_p', y='salary', data = dataset)
plt.minorticks_on()
# Grid flag passed positionally: the `b` keyword was renamed to `visible` and
# is rejected by newer Matplotlib releases.
plt.grid(True, which='both', axis='both', alpha=0.1)
plt.title('Salary vs HSC Percentage')
plt.show()

OBSERVATION:

The correlation between salary and HSC percentage can be visualised to be very less. This will not be a very good indicator for the salary offered to the student.

### 5. hsc_b (Higher Secondary Board)

In [None]:
# Fix the category order once so that the violins, the medians and the counts
# all refer to the same category at the same x position.
order = list(dataset['hsc_b'].unique())
ax = sns.violinplot(x = 'hsc_b', y = 'salary', data = dataset, order = order)

# Median salary and number of rows per category, re-ordered to match the x axis.
# (Previously medians were sorted by category name and counts by frequency,
# so labels could be attached to the wrong violin.)
group_stats = dataset.groupby('hsc_b')['salary'].agg(['median', 'size']).reindex(order)

# Write "n = <count>" just above each category's median
for xpos, (median, count) in enumerate(zip(group_stats['median'], group_stats['size'])):
    ax.text(xpos, median + 0.04, 'n = {}'.format(count), horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('Salary vs HSC Board')
# Grid flag passed positionally: the `b` keyword was renamed to `visible` and
# is rejected by newer Matplotlib releases.
plt.grid(True, which='major', axis='both', color='#666666', linestyle='-')
plt.minorticks_on()
plt.grid(True, which='minor', axis='y', color='#999999', linestyle='-', alpha=0.2)
plt.show()

OBSERVATION:

1. The median lies around the same value for both the boards.
2. The range of salaries offered to students of the Central board is more than the range offered to the students of Other boards. 

We can't particularly say if it is a good indicator for the salary. We will check the correlation later and also in the model refinement process.

### 6. hsc_s (Higher Secondary Subject)

In [None]:
plt.figure(figsize=(10,8))

# Fix the category order once so that the violins, the medians and the counts
# all refer to the same category at the same x position.
order = list(dataset['hsc_s'].unique())
ax = sns.violinplot(x = 'hsc_s', y = 'salary', data = dataset, order = order)

# Median salary and number of rows per category, re-ordered to match the x axis.
# (Previously medians were sorted by category name and counts by frequency,
# so labels could be attached to the wrong violin.)
group_stats = dataset.groupby('hsc_s')['salary'].agg(['median', 'size']).reindex(order)

# Write "n = <count>" just above each category's median
for xpos, (median, count) in enumerate(zip(group_stats['median'], group_stats['size'])):
    ax.text(xpos, median + 0.04, 'n = {}'.format(count), horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('Salary vs HSC Subjects')
plt.show()

OBSERVATION:

1. The range of salaries offered are much greater for Students who took Commerce, and then Science. Arts students have a very small range of salaries offered.
2. The correlation needs to be checked to check if this is a good indicator for salary offered.

### 7. degree_p (Graduation Degree Percentage)

In [None]:
# Scatter plot with a fitted regression line: degree percentage against salary
plt.figure(figsize=(8,6))
sns.regplot(x='degree_p', y='salary', data = dataset)
plt.minorticks_on()
# Grid flag passed positionally: the `b` keyword was renamed to `visible` and
# is rejected by newer Matplotlib releases.
plt.grid(True, which='both', axis='both', alpha=0.1)
plt.title('Salary vs Degree Percentage')
plt.show()

OBSERVATION:

The correlation is either very less or 0 between the percentage of the degree and the salary offered.

### 8. degree_t (Graduation Specialisation)

In [None]:
plt.figure(figsize=(10,8))

# Fix the category order once so that the violins, the medians and the counts
# all refer to the same category at the same x position.
order = list(dataset['degree_t'].unique())
ax = sns.violinplot(x = 'degree_t', y = 'salary', data = dataset, order = order)

# Median salary and number of rows per category, re-ordered to match the x axis.
# (Previously medians were sorted by category name and counts by frequency,
# so labels could be attached to the wrong violin.)
group_stats = dataset.groupby('degree_t')['salary'].agg(['median', 'size']).reindex(order)

# Write "n = <count>" just above each category's median
for xpos, (median, count) in enumerate(zip(group_stats['median'], group_stats['size'])):
    ax.text(xpos, median + 0.04, 'n = {}'.format(count), horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('Salary vs Graduate Specialisation')
plt.show()

In [None]:
# Salary by graduation specialisation, with one violin per gender in each group
plt.figure(figsize=(10, 8))
ax = sns.violinplot(data=dataset, x='degree_t', y='salary', hue='gender')
ax.set_title('Salary vs Graduate Specialisation and gender')
plt.show()

OBSERVATION:

1. The range of salaries offered are in the order Comm&Mgmt > Sci&Tech > Others
2. Correlation between the degree specialisation and the salary offered needs to be checked to check if this is a good indicator or not.
3. The range of salaries offered to females (Sci&Tech and Comm&Mgmt fields) was much smaller than the range of salaries offered to males.
4. Females who pursued Other fields were offered a decent range of salaries whereas for males in the same field, there was an absence of the same.

### 9. workex (Work Experience)

In [None]:
plt.figure(figsize=(10,8))

# Fix the category order once so that the violins, the medians and the counts
# all refer to the same category at the same x position.
order = list(dataset['workex'].unique())
ax = sns.violinplot(x = 'workex', y = 'salary', data = dataset, order = order)

# Median salary and number of rows per category, re-ordered to match the x axis.
# (Previously medians were sorted by category name and counts by frequency,
# so labels could be attached to the wrong violin.)
group_stats = dataset.groupby('workex')['salary'].agg(['median', 'size']).reindex(order)

# Write "n = <count>" just above each category's median
for xpos, (median, count) in enumerate(zip(group_stats['median'], group_stats['size'])):
    ax.text(xpos, median + 0.04, 'n = {}'.format(count), horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('Salary vs Work Experience')
plt.show()

In [None]:
# Salary by work experience, with one violin per gender in each group
plt.figure(figsize=(10, 8))
ax = sns.violinplot(data=dataset, x='workex', y='salary', hue='gender')
ax.set_title('Salary vs Work Experience and Gender')
plt.show()

OBSERVATION:

1. The median of salary offered to students who had some work experience is slightly more than the ones who did not have a work experience.
2. The range of salaries offered to students who had some work experience is a lot higher than the range offered to the ones who did not have work experience.
3. Females with or without work experience were offered a range of salary less than the males.

### 10. etest_p (Employability Test Percentage)

In [None]:
# Scatter plot with a fitted regression line: employability test percentage against salary
plt.figure(figsize=(8,6))
sns.regplot(x='etest_p', y='salary', data = dataset)
plt.minorticks_on()
# Grid flag passed positionally: the `b` keyword was renamed to `visible` and
# is rejected by newer Matplotlib releases.
plt.grid(True, which='both', axis='both', alpha=0.1)
plt.title('Salary vs Employability Test Percentage')
plt.show()

OBSERVATION:

A less significant positive correlation can be visualised between the salary offered and the employability test percentage.

### 11. specialisation (in MBA)

In [None]:
plt.figure(figsize=(10,8))

# Fix the category order once so that the violins, the medians and the counts
# all refer to the same category at the same x position.
order = list(dataset['specialisation'].unique())
ax = sns.violinplot(x = 'specialisation', y = 'salary', data = dataset, order = order)

# Median salary and number of rows per category, re-ordered to match the x axis.
# (Previously medians were sorted by category name and counts by frequency,
# so labels could be attached to the wrong violin.)
group_stats = dataset.groupby('specialisation')['salary'].agg(['median', 'size']).reindex(order)

# Write "n = <count>" just above each category's median
for xpos, (median, count) in enumerate(zip(group_stats['median'], group_stats['size'])):
    ax.text(xpos, median + 0.04, 'n = {}'.format(count), horizontalalignment='center', size='x-small', color='w', weight='semibold')

plt.title('Salary vs specialisation in MBA')
plt.show()

In [None]:
# Salary by MBA specialisation, with one violin per gender in each group
plt.figure(figsize=(10, 8))
ax = sns.violinplot(data=dataset, x='specialisation', y='salary', hue='gender')
ax.set_title('Salary vs specialisation in MBA and gender')
plt.show()

OBSERVATION:

1. The median salary offered to students who pursued Mkt&Fin is slightly higher than the median salary offered to students who pursued Mkt&HR in MBA.
2. The range of salaries offered to students who pursued Mkt&Fin is much larger than the range offered to students who pursued Mkt&HR in MBA.
3. There could be some correlation. 
4. The females who studied in either of the fields were offered a smaller range of salaries than the males.

### 12. mba_p (Percentage in MBA)

In [None]:
# Scatter plot with a fitted regression line: MBA percentage against salary
plt.figure(figsize=(8,6))
sns.regplot(x='mba_p', y='salary', data = dataset)
plt.minorticks_on()
# Grid flag passed positionally: the `b` keyword was renamed to `visible` and
# is rejected by newer Matplotlib releases.
plt.grid(True, which='both', axis='both', alpha=0.1)
plt.title('Salary vs MBA Percentage')
plt.show()

OBSERVATION:

A less significant correlation can be visualised between the MBA percentage and the salary offered to the student.

## Preprocessing

In [None]:
# Show only the first row as a reminder of the current columns
dataset.head(n=1)

Steps we need to take:
1. Remove the first column of serial numbers.
2. We need to convert all the categorical variables into one-hot encoded (dummy) values.
3. Drop the original categorical variable columns.

In [None]:
# Drop the serial-number column in place.
# errors='ignore' lets the cell be re-run after the column is already gone;
# the redundant axis=1 is dropped because `columns=` already names the axis.
dataset.drop(columns=['sl_no'], inplace=True, errors='ignore')
dataset.head(1)

In [None]:
# Gender: F coded as 0 and M as 1
# dtype=int keeps the indicator numeric; newer pandas returns booleans by
# default, which would turn the later `.values` feature matrices into object arrays.
dummy = pd.get_dummies(dataset['gender'], dtype=int).rename(columns={'M': 'Gender'})

# Start the model frame `df`: the Gender indicator followed by every other column.
# `dataset` is no longer modified in place, so this cell can be re-run safely.
df = pd.concat([dummy['Gender'], dataset.drop(columns=['gender'])], axis=1)

df.head(1)

In [None]:
# ssc_b: Central as 1 and Others as 0
# dtype=int keeps the indicator numeric (newer pandas defaults to booleans).
dummy = pd.get_dummies(dataset['ssc_b'], dtype=int).rename(columns={'Central': 'ssc_b'})

# Remove the current ssc_b column and put the indicator back at column position 2
df = df.drop(columns=['ssc_b'])
df = pd.concat([df.iloc[:, 0:2], dummy['ssc_b'], df.iloc[:, 2:]], axis=1)

df.head(1)

In [None]:
# hsc_b: Central as 1 and Others as 0
# dtype=int keeps the indicator numeric (newer pandas defaults to booleans).
dummy = pd.get_dummies(dataset['hsc_b'], dtype=int).rename(columns={'Central': 'hsc_b'})

# Remove the current hsc_b column and put the indicator back at column position 4
df = df.drop(columns=['hsc_b'])
df = pd.concat([df.iloc[:, 0:4], dummy['hsc_b'], df.iloc[:, 4:]], axis=1)

df.head(1)

In [None]:
# Higher Secondary Specialisation: Science: 10 and Commerce: 01 and Arts: 00
# The dummies are built from `dataset` (which still holds the text column) so the
# cell can be re-run; dtype=int keeps them numeric (newer pandas defaults to booleans).
dummy = pd.get_dummies(dataset['hsc_s'], dtype=int).rename(columns={'Science': 'HS_Sci', 'Commerce': 'HS_Comm'})
# Keep only the two indicators; Arts is the all-zero baseline
dummy = dummy[['HS_Sci', 'HS_Comm']]

# Drop the text column (and, on a re-run, the indicators added earlier)
df = df.drop(columns=['hsc_s', 'HS_Sci', 'HS_Comm'], errors='ignore')

# Insert the two indicators starting at column position 5
df = pd.concat([df.iloc[:, 0:5], dummy, df.iloc[:, 5:]], axis=1)

df.head(1)

In [None]:
# Undergrad specialisation: Sci&Tech: 10 and Comm&Mgmt: 01 and Others: 00
# The dummies are built from `dataset` (which still holds the text column) so the
# cell can be re-run; dtype=int keeps them numeric (newer pandas defaults to booleans).
dummy = pd.get_dummies(dataset['degree_t'], dtype=int).rename(columns={'Sci&Tech': 'UG_Sci', 'Comm&Mgmt': 'UG_Comm'})
# Keep only the two indicators; Others is the all-zero baseline
dummy = dummy[['UG_Sci', 'UG_Comm']]

# Drop the text column (and, on a re-run, the indicators added earlier)
df = df.drop(columns=['degree_t', 'UG_Sci', 'UG_Comm'], errors='ignore')

# Insert the two indicators starting at column position 8
df = pd.concat([df.iloc[:, 0:8], dummy, df.iloc[:, 8:]], axis=1)

df.head(1)

In [None]:
# Work experience: Yes as 1 and No as 0
# The dummies are built from `dataset` (which still holds the text column) so the
# cell can be re-run; dtype=int keeps them numeric (newer pandas defaults to booleans).
dummy = pd.get_dummies(dataset['workex'], dtype=int).rename(columns={'Yes': 'workex'})

# Remove the current workex column and put the indicator back at column position 10
df = df.drop(columns=['workex'])
df = pd.concat([df.iloc[:, 0:10], dummy['workex'], df.iloc[:, 10:]], axis=1)

df.head(1)

In [None]:
# Specialisation: Mkt&Fin as 1 and Mkt&HR as 0
# The dummies are built from `dataset` (which still holds the text column) so the
# cell can be re-run; dtype=int keeps them numeric (newer pandas defaults to booleans).
dummy = pd.get_dummies(dataset['specialisation'], dtype=int).rename(columns={'Mkt&Fin': 'specialisation'})

# Remove the current specialisation column and put the indicator back at column position 12
df = df.drop(columns=['specialisation'])
df = pd.concat([df.iloc[:, 0:12], dummy['specialisation'], df.iloc[:, 12:]], axis=1)

df.head(1)

**Correlation between all variables**

In [None]:
# Pairwise correlations of every column in the encoded frame, drawn as an annotated heatmap
corr_matrix = df.corr()

plt.figure(figsize=(14, 12))
sns.heatmap(data=corr_matrix, annot=True)
plt.title('Correlation between all features and salary offered')
plt.show()

In [None]:
# Split the encoded frame into predictors (all but the last column) and target (last column)
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

print(f'X_shape {X.shape}')
print(f'y_shape {y.shape}')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

print(f'Shape of training set: {X_train.shape} and test set: {X_test.shape}')

## Model Construction

**We will try models to see which model suits the data best.**

### 1. Multiple Linear Regression

In [None]:
# Ordinary least-squares regression on the full feature set
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# R^2 on each split first, then the error metrics on the held-out split
for caption, value in (
    ('Train Score: ', regressor.score(X_train, y_train)),
    ('Test Score: ', regressor.score(X_test, y_test)),
):
    print(caption, value)
print()
for caption, value in (
    ('MAE: ', mean_absolute_error(y_test, y_pred)),
    ('MSE: ', mean_squared_error(y_test, y_pred)),
    ('R2 score: ', r2_score(y_test, y_pred)),
):
    print(caption, value)

### 2. Polynomial Regression

In [None]:
# Expand the features into all polynomial terms up to degree 3.
# The transformer is fitted on the training split only; the test split is
# transformed with that same fitted object instead of being re-fitted.
poly_reg = PolynomialFeatures(degree = 3)
X_train_poly = poly_reg.fit_transform(X_train)
X_test_poly = poly_reg.transform(X_test)

# Fit a linear model on the expanded training features
regressor = LinearRegression()
regressor.fit(X_train_poly, y_train)

# Predicting test values
y_pred = regressor.predict(X_test_poly)

# Model performance through metrics
print('Train Score: ', regressor.score(X_train_poly, y_train))
print('Test Score: ', regressor.score(X_test_poly, y_test))
print()
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('MSE: ', mean_squared_error(y_test, y_pred))
print('R2 score: ', r2_score(y_test, y_pred))

### 3. Support Vector Regression

In [None]:
# Standardise the features for SVR.
# The scaler learns its mean and variance from the training split only and the
# test split is transformed with those same statistics. Re-fitting on the test
# split (as before) scales the two splits differently and uses test-set
# information during preprocessing.
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

print('Scaled Successfully')

In [None]:
# RBF-kernel support vector regressor trained on the standardised features
regressor = SVR(kernel='rbf')
regressor.fit(X_train_sc, y_train)
y_pred = regressor.predict(X_test_sc)

# R^2 on each split first, then the error metrics on the held-out split
for caption, value in (
    ('Train Score: ', regressor.score(X_train_sc, y_train)),
    ('Test Score: ', regressor.score(X_test_sc, y_test)),
):
    print(caption, value)
print()
for caption, value in (
    ('MAE: ', mean_absolute_error(y_test, y_pred)),
    ('MSE: ', mean_squared_error(y_test, y_pred)),
    ('R2 score: ', r2_score(y_test, y_pred)),
):
    print(caption, value)

### 4. Decision Tree Regressor

In [None]:
# Decision tree on the unscaled features.
# random_state is fixed (same seed as the train/test split) so the reported
# scores are the same on every run.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Predicting test values
y_pred = regressor.predict(X_test)

# Model performance through metrics
print('Train Score: ', regressor.score(X_train, y_train))
print('Test Score: ', regressor.score(X_test, y_test))
print()
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('MSE: ', mean_squared_error(y_test, y_pred))
print('R2 score: ', r2_score(y_test, y_pred))

### 5. Random Forest Regression


In [None]:
# Random forest of 10 trees on the unscaled features.
# random_state is fixed (same seed as the train/test split) so the reported
# scores, and the model ranking drawn from them, are the same on every run.
regressor = RandomForestRegressor(n_estimators = 10, random_state=0)
regressor.fit(X_train, y_train)

# Predicting test values
y_pred = regressor.predict(X_test)

# Model performance through metrics
print('Train Score: ', regressor.score(X_train, y_train))
print('Test Score: ', regressor.score(X_test, y_test))
print()
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('MSE: ', mean_squared_error(y_test, y_pred))
print('R2 score: ', r2_score(y_test, y_pred))

Of all the models tried above:

**Multiple Linear Regression works best**, 

followed by **Random Forest Regression**, and 

then **SVR**.

We will thus, go forward with the **Multiple Linear Regression** and **Random Forest Regression** and will try to better the model.

## Backward Elimination for Multiple LR

We will use Backward Elimination to find out the variables right for our multiple Linear Regression Model.

STEPS:
1. Select significance level to stay in the model (SL = 0.05)
2. Fit the full model with all predictors
3. Consider predictor with highest p-value. If p>SL, go to S4, else FINISH.
4. Remove the predictor.
5. Fit model without the predictor. Go back to S3.

FINISH.

In [None]:
# statsmodels' OLS does not add the intercept term (b_0) on its own, so a
# column of ones (x_0) is prepended to the features explicitly.
# The matrix is cast to float so OLS always receives a numeric array.
X_new = df.iloc[:, :-1].values.astype(float)
# The row count is taken from the data instead of the hard-coded 148.
X_new = np.append(arr = np.ones((X_new.shape[0], 1)), values = X_new, axis = 1)

print(X_new.shape)

### Iteration 1

In [None]:
# Backward elimination, step 1: start from the full design matrix
# (column 0 is the column of ones, columns 1-14 are the features).
X_opt = X_new[:, list(range(15))]

# Significance level used throughout: SL = 0.05

# Fit OLS on the current column set
regressor_OLS = sm.OLS(exog = X_opt, endog = y).fit()

# Read the p-values from the summary table; the predictor with the highest
# p-value above SL is the one removed in the next step.
regressor_OLS.summary()

### Iteration 2

In [None]:
# Backward elimination, step 2.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new columns 3 and 4 (ssc_b, hsc_p).
# NOTE(review): two predictors are removed at once here, whereas the procedure
# described above removes one predictor per step.
X_opt = X_new[:, [0,1,2,5,6,7,8,9,10,11,12,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 3

In [None]:
# Backward elimination, step 3.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 9 (UG_Sci).
X_opt = X_new[:, [0,1,2,5,6,7,8,10,11,12,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 4

In [None]:
# Backward elimination, step 4.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 5 (hsc_b).
X_opt = X_new[:, [0,1,2,6,7,8,10,11,12,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 5

In [None]:
# Backward elimination, step 5.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 11 (workex).
X_opt = X_new[:, [0,1,2,6,7,8,10,12,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 6

In [None]:
# Backward elimination, step 6.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 2 (ssc_p).
X_opt = X_new[:, [0,1,6,7,8,10,12,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 7

In [None]:
# Backward elimination, step 7.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 12 (etest_p).
X_opt = X_new[:, [0,1,6,7,8,10,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 8

In [None]:
# Backward elimination, step 8.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 0, i.e. the column of ones (the intercept).
# NOTE(review): from here on the model has no constant term. statsmodels then
# reports the uncentered R-squared, which is not comparable with the R-squared
# of the earlier steps that included the constant.
X_opt = X_new[:, [1,6,7,8,10,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 9

In [None]:
# Backward elimination, step 9.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 8 (degree_p).
X_opt = X_new[:, [1,6,7,10,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 10

In [None]:
# Backward elimination, step 10.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 6 (HS_Sci).
X_opt = X_new[:, [1,7,10,13,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 11

In [None]:
# Backward elimination, step 11.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 13 (specialisation).
X_opt = X_new[:, [1,7,10,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values; the predictor with the highest p-value above SL is removed next
regressor_OLS.summary()

### Iteration 12

In [None]:
# Backward elimination, step 12.
# Columns are selected by their position in X_new (0 = column of ones,
# 1-14 = features in df column order) instead of relative to the previous X_opt,
# so re-running this cell always produces the same matrix.
# Dropped in this step: X_new column 7 (HS_Comm).
# Remaining columns: 1 (Gender), 10 (UG_Comm), 14 (mba_p).
X_opt = X_new[:, [1,10,14]]

# Significance level: SL = 0.05

# Fit OLS on the reduced column set
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

# Inspect the p-values of the three remaining predictors
regressor_OLS.summary()

**SCORES**

**R-squared value --> 0.914**

**Adjusted R-squared value --> 0.912**

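This score suggests that the model performs better with the three features in X_opt. Note, however, that the intercept column was removed in Iteration 8; without a constant term statsmodels reports the uncentered R-squared, so this value is not directly comparable with the R-squared values of the earlier iterations.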

However, 

we do see that the p-value for feature **x2** exceeds the significance level we chose (SL = 0.05). We will drop that feature and see if our model performs better.

### Iteration 13

In [None]:
# Trial step: keep columns 0 and 2 of the current X_opt (leaving out column 1)
# in a separate array, so X_opt itself stays as it was after Iteration 12.
X_opt_final = X_opt.take([0, 2], axis=1)

# Significance level: SL = 0.05

# Fit OLS on the trial column set
regressor_OLS = sm.OLS(exog = X_opt_final, endog = y).fit()

# Compare this summary with the previous one to judge the effect of the removal
regressor_OLS.summary()

**SCORES**

**R-squared value --> 0.913**

**Adjusted R-squared value --> 0.911**

This score shows that the model **FALLS** in performance when we **drop the x2 feature from X_opt.**

Thus, we will keep the **three features we selected in Iteration 12.**

In [None]:
print('Shape of optimal values or X: ', X_opt.shape)

# Show the first five rows so the selected columns can be matched against df
X_opt[:5]

In [None]:
# First five rows of the encoded frame, for comparison with the X_opt rows
df.head(n=5)

On comparing the original dataset with the optimal table of X, we can see that the features that best give the salary prediction are:

1. Gender
2. UnderGraduate Degree (Comm/Sci or Others)
3. Percentage in MBA

## Final Model (Multiple Linear Regression)

In [None]:
# Features kept by the backward elimination, selected by column name rather
# than by position so the selection survives any change in column order.
# (These are the columns previously addressed as positions 0, 9 and 13.)
X_final = df[['Gender', 'UG_Comm', 'mba_p']].values
# Target: the salary column (previously addressed as position 14)
y = df['salary'].values

print('X_shape {}'.format(X_final.shape))
print('y_shape {}'.format(y.shape))

In [None]:
# Same 70/30 split and seed as before, now on the reduced feature set
X_final_train, X_final_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.3,
    random_state=0,
)

print(f'Shape of training set: {X_final_train.shape} and test set: {X_final_test.shape}')

In [None]:
# Linear regression on the selected features only
regressor = LinearRegression()
regressor.fit(X_final_train, y_train)
y_pred = regressor.predict(X_final_test)

# R^2 on each split first, then the error metrics on the held-out split
for caption, value in (
    ('Train Score: ', regressor.score(X_final_train, y_train)),
    ('Test Score: ', regressor.score(X_final_test, y_test)),
):
    print(caption, value)
print()
for caption, value in (
    ('MAE: ', mean_absolute_error(y_test, y_pred)),
    ('MSE: ', mean_squared_error(y_test, y_pred)),
    ('R2 score: ', r2_score(y_test, y_pred)),
):
    print(caption, value)

### Cross Validation

In [None]:
# 10-fold cross-validation of the final regressor on the training split
reg_score = cross_val_score(estimator=regressor, X=X_final_train, y=y_train, cv=10)

print('Cross Validation Scores across all 10 iterations: ', reg_score)
print('Multiple Linear Regression: ', reg_score.mean())

## Thoughts

The model here performs quite poorly even after choosing out the best features. That makes me wonder:

1. If the salary offered does really depend on the specifics of the student's biodata, or 
2. Does it depend on the company policies/existing salaries and posts in the company?
3. Does it depend on how the candidate's interview went?

As visible, **none** of the features showed a very strong negative or positive correlation with the salary offered. This did imply that of the features given, **no particular feature** was a strong predictor for the salaries. 

### Further Work

Further work would include, 
1. Working on Random Forest model to see if that model gives better predictions or not.
2. Finding a better combination of variables that gives us better salary predictions with the RF model.

### Previous Work: Check out [here](http://www.kaggle.com/mani97/placed-or-not-eda-classification-88-8) for EDA and Classification modelling to predict if a student was placed or not!

### Note:

Do comment and let me know any ideas that I missed out on or if I could better my thought process regarding this in any way.

Thank you!