In [None]:
from pandas import DataFrame, read_csv, get_dummies
from scipy.stats import zscore
from statsmodels.stats.outliers_influence import variance_inflation_factor
from matplotlib.pyplot import figure, subplot2grid
from seaborn import set_theme,scatterplot,displot,barplot,countplot,heatmap
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
from numpy import where,abs,median,nan,sqrt
%matplotlib inline

In [None]:
# Apply one seaborn theme (context, style and palette) to the figures drawn in this notebook.
set_theme(context="notebook",style='darkgrid', palette='inferno')

In [None]:
# Load the dataset; the path is relative, so the CSV must be in the working directory.
df = read_csv("Engineering_graduate_salary.csv")

In [None]:
# Summarise the columns, their dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of the frame.
df.head()

A value of -1 in the data means that the student has not attempted that section

For sake of convenience I will replace -1 with 0

So if a student has not attempted the section it will add nothing to the Salary

In [None]:
# Recode the -1 sentinel as 0. The replacement is applied to every column of
# the frame, not only to the test-score columns.
df = df.replace(-1, 0)

Now let's plot correlation matrix

In [None]:
# Correlate the numeric columns only. The frame still holds text columns at
# this point (for example Gender and 12board), and DataFrame.corr raises on
# them in pandas >= 2.0 unless numeric_only=True is passed.
figure(figsize=(16,16))
plot = heatmap(df.corr(numeric_only=True), annot=True)
plot.set_title("Correlation Matrix", fontsize=18)

I will drop ID and DOB columns as they are not very relevant to the Salary

In [None]:
# Discard the ID and DOB columns; the frame is rebound rather than edited in place.
df = df.drop(columns=["ID", "DOB"])

I will map Gender to integers so that it can be used in the calculations

In [None]:
# Encode Gender as integers with an explicit mapping. Series.map builds the
# numeric column directly instead of relying on replace() to downcast an
# object column, which recent pandas versions deprecate. Any code other than
# 'm' or 'f' becomes NaN, so the int64 cast fails loudly instead of leaving
# text in a model feature. Run this once, on the raw text column.
df["Gender"] = df["Gender"].map({'m': 0, 'f': 1}).astype("int64")

I will calculate the [variance inflation factor](https://en.wikipedia.org/wiki/Variance_inflation_factor) (VIF), which will help us identify whether there is [multicollinearity](https://en.wikipedia.org/wiki/Multicollinearity) in the data

In [None]:
# Predictors for the multicollinearity check. The target (Salary) is left
# out: a variance inflation factor describes how well one explanatory
# variable is explained by the other explanatory variables, so the response
# does not belong in this matrix.
X = df[['Gender', '10percentage', '12graduation', '12percentage', 'CollegeID', 'CollegeTier', 'collegeGPA', 'CollegeCityID', 'CollegeCityTier', 'GraduationYear', 'English', 'Logical', 'Quant', 'Domain', 'ComputerProgramming', 'ElectronicsAndSemicon', 'ComputerScience', 'MechanicalEngg', 'ElectricalEngg', 'TelecomEngg', 'CivilEngg', 'conscientiousness', 'agreeableness', 'extraversion', 'nueroticism', 'openess_to_experience']]

# One VIF per predictor, addressed by its column position in X.
# NOTE(review): no intercept column is added to X before the VIFs are
# computed -- confirm whether that is intended.
vif_data = DataFrame({
    "Column": X.columns,
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})

vif_data

In [None]:
# Distribution plot of Salary with a kernel density estimate (kde=True).
plot = displot(df["Salary"], kde=True)

Our data looks very skewed; this can be because of outliers (outliers are the points in the dataset that deviate strongly from most of the data)

To make it a more normalised distribution we can remove Outliers

We can remove outliers using various algorithms and technique

Here I am going to use the Z-score: the number of standard deviations by which a point deviates from the mean, $z = (x - \mu) / \sigma$.

We will replace the values whose absolute Z-score is greater than 3 with the median value.

In [None]:
# Replace every Salary whose absolute z-score exceeds 3 with the median
# Salary. A boolean mask selects the rows directly; the earlier form passed
# a Series nested inside a list to replace(), which depends on how pandas
# happens to interpret that nested argument.
df["Salary"] = df["Salary"].mask(abs(zscore(df["Salary"])) > 3, median(df["Salary"]))

In [None]:
# Plot the Salary distribution again (with KDE) to compare with the earlier plot.
plot = displot(data=df["Salary"], kde=True)

This looks more like a normal distribution.

In [None]:
# Salary on the x-axis against the encoded Gender on the y-axis.
plot = scatterplot(data=df, x="Salary", y="Gender")
plot.set_title("Scatter Plot of Gender with Salary", fontsize=14)

In [None]:
# Compare the two school percentage columns point by point.
plot = scatterplot(data=df, x="10percentage", y="12percentage")
plot.set_title("Scatter Plot of 10percentage with 12percentage", fontsize=14)

In [None]:
# Correlation coefficient between the two percentage columns (Series.corr default method).
df["10percentage"].corr(df["12percentage"])

There is high correlation between 10 & 12 percentages

Also there is high multicollinearity as seen in the VIF values

So instead of using both, I will use only 12 percentage as 12 percentage is somewhat more important as compared to 10 percentage.

In [None]:
# Keep 12percentage only; 10percentage is removed from the frame.
df = df.drop(columns="10percentage")

In [None]:
# Plot the two college identifier columns against each other.
plot = scatterplot(data=df, x="CollegeID", y="CollegeCityID")
plot.set_title("Scatter Plot of CollegeID with CollegeCityID", fontsize=14)

Looks like CollegeID and CollegeCityID are the same for every row

They also had an infinite VIF

So it is necessary to drop one of them

In [None]:
# Remove CollegeID and keep CollegeCityID.
df = df.drop(columns="CollegeID")
# Number of distinct values in the remaining identifier column.
df["CollegeCityID"].nunique()

In [None]:
# Year of 12th graduation against year of college graduation.
plot = scatterplot(data=df, x="12graduation", y="GraduationYear")
plot.set_title("Scatter Plot of 12graduation with GraduationYear", fontsize=14)

There is a point with 0 graduation year

This can be an error in the dataset
Let's remove it

In [None]:
# Keep only rows whose GraduationYear is later than 1750.
df = df.loc[df["GraduationYear"].gt(1750)]

In [None]:
# Correlation coefficient between GraduationYear and 12graduation.
df["GraduationYear"].corr(df["12graduation"])

There is high correlation between 12graduation and GraduationYear

Also there is high multicollinearity as seen in the VIF values

So again instead of using both, I will use only GraduationYear

In [None]:
# Keep GraduationYear only; 12graduation is removed from the frame.
df = df.drop(columns="12graduation")

In [None]:
# 12percentage against collegeGPA, with points coloured by CollegeTier.
figure(figsize=(16,8))
plot = scatterplot(data=df, x="12percentage", y="collegeGPA", hue="CollegeTier")
plot.set_title("Scatter Plot of 12percentage with collegeGPA", fontsize=14)

In [None]:
# collegeGPA on the x-axis against Salary on the y-axis.
plot = scatterplot(data=df, x="collegeGPA", y="Salary")
plot.set_title("Scatter Plot of collegeGPA with Salary", fontsize=14)

I will manually remove the outliers as they are easily identifiable in the plot

In [None]:
# Keep only rows whose collegeGPA is above 40.
df = df.loc[df["collegeGPA"].gt(40)]

In [None]:
# One stacked panel per aptitude section, each plotted against Salary.
figure(figsize=(8,24))
for position, section in enumerate(["English", "Logical", "Quant"]):
    subplot2grid((3,1),(position,0))
    scatterplot(data=df, x=section, y="Salary").set_title(section, fontsize=14)

In [None]:
# Domain score on the x-axis against Salary on the y-axis.
plot = scatterplot(data=df, x="Domain", y="Salary")
plot.set_title("Domain", fontsize=14)

In [None]:
# A 4x2 grid of Salary scatter plots, one per subject score, filled row by
# row; divmod turns the running position into (row, column).
figure(figsize=(16,32))
for position, subject in enumerate([
    "ComputerProgramming", "ElectronicsAndSemicon", "ComputerScience",
    "MechanicalEngg", "ElectricalEngg", "TelecomEngg", "CivilEngg",
]):
    subplot2grid((4,2), divmod(position, 2))
    scatterplot(data=df, x=subject, y="Salary").set_title(subject, fontsize=14)

In [None]:
# 10board against 12board, drawn on a large square canvas.
figure(figsize=(16,16))
plot = scatterplot(data=df, x="10board", y="12board")
plot.set_title("Scatter Plot of 10board with 12board", fontsize=14)

Although 10board and 12board are not identical, I will still remove 10board to reduce the risk of multicollinearity

In [None]:
# Keep 12board only; 10board is removed from the frame.
df = df.drop(columns="10board")

In [None]:
# Number of rows per 12board value.
figure(figsize=(16,8))
plot = countplot(data=df, x="12board")
# Stand the category labels upright so they do not overlap.
plot.tick_params(axis="x", labelrotation=90)
plot.set_title("Count Plot of 12board", fontsize=14)

In [None]:
# Row count per distinct 12board value; the index holds the board names.
board = df["12board"].value_counts()

There are many boards which are very rarely chosen by students

So I will take major boards and categorize rest of them as other

In [None]:
# Boards that occur in at most 10 rows.
rare_board = board[board.le(10)]

In [None]:
def remove_rare_board(value):
    """Return 'other' for a board listed in rare_board, otherwise the value unchanged."""
    # Membership is tested against the index labels of rare_board.
    return 'other' if value in rare_board.index else value

# Apply the recoding to every row, then show the resulting counts.
df["12board"] = df["12board"].map(remove_rare_board)
df["12board"].value_counts()

Looks like there is another error, as '0' is not a real board
I will replace this '0' board with 'cbse' as it is the most common one

In [None]:
# Recode the placeholder board '0' as 'cbse' and assign the result back to
# the column. Calling replace(..., inplace=True) on the selected column
# works on an intermediate object: pandas warns about that pattern and,
# with Copy-on-Write enabled, df itself is left unchanged.
df["12board"] = df["12board"].replace(to_replace='0', value='cbse')

I will convert categorical data to numeric data using get_dummies, so we can use these in our model

In [None]:
# One-hot encode 12board. The prefix "board_" joined with the separator "_"
# gives column names of the form board__<value>.
df = get_dummies(data=df, columns=["12board"], prefix=["board_"], prefix_sep="_")

In [None]:
# Salary aggregated per Degree (barplot's default estimator).
figure(figsize=(16,8))
plot = barplot(data=df, x="Degree", y="Salary")
# Stand the category labels upright so they do not overlap.
plot.tick_params(axis="x", labelrotation=90)
plot.set_title("Bar Plot of Degree with Salary", fontsize=14)

In [None]:
# Row count per distinct Degree value.
df["Degree"].value_counts()

Looks like majority of students belong to B.Tech./B.E. but still degree is very important factor in determining Salary

Again I will convert categorical data to numeric data using get_dummies, so we can use these in our model

In [None]:
# One-hot encode Degree. The prefix "degree_" joined with the separator "_"
# gives column names of the form degree__<value>.
df = get_dummies(data=df, columns=["Degree"], prefix=["degree_"], prefix_sep="_")

In [None]:
# Number of rows per Specialization value.
figure(figsize=(16,8))
plot = countplot(data=df, x="Specialization")
# Stand the category labels upright so they do not overlap.
plot.tick_params(axis="x", labelrotation=90)
plot.set_title("Count Plot of Specialization", fontsize=14)

In [None]:
# Row count per distinct Specialization value; the index holds the specialization names.
specializations = df["Specialization"].value_counts()

I will categorize rarely chosen specializations into other

In [None]:
# Specializations that occur in at most 10 rows.
rare_specialization = specializations[specializations.le(10)]

In [None]:
def remove_rare_specializations(value):
    """Return 'other' for a specialization listed in rare_specialization, otherwise the value unchanged."""
    # Membership is tested against the index labels of rare_specialization.
    return 'other' if value in rare_specialization.index else value

# Apply the recoding to every row, then show the resulting counts.
df["Specialization"] = df["Specialization"].map(remove_rare_specializations)
df["Specialization"].value_counts()

Now let's convert it to numeric so we can use it in our model

In [None]:
# One-hot encode Specialization. The prefix "specialization_" joined with the
# separator "_" gives column names of the form specialization__<value>.
df = get_dummies(data=df, columns=["Specialization"], prefix=["specialization_"], prefix_sep="_")

In [None]:
# Salary scatter plots for the five personality scores, laid out row by row
# on a 4x2 grid; divmod turns the running position into (row, column).
figure(figsize=(16,24))
for position, trait in enumerate([
    "conscientiousness", "agreeableness", "extraversion",
    "nueroticism", "openess_to_experience",
]):
    subplot2grid((4,2), divmod(position, 2))
    scatterplot(data=df, x=trait, y="Salary").set_title(trait, fontsize=14)

In [None]:
# CollegeTier against CollegeCityTier.
plot = scatterplot(data=df, x="CollegeTier", y="CollegeCityTier")
plot.set_title("Scatter Plot of CollegeTier with CollegeCityTier", fontsize=14)

Let's remove CollegeCityTier as it is not as relevant as CollegeTier

In [None]:
# Keep CollegeTier only; CollegeCityTier is removed from the frame.
df = df.drop(columns="CollegeCityTier")

In [None]:
# Number of rows per CollegeState value.
figure(figsize=(16,8))
plot = countplot(data=df, x="CollegeState")
# Stand the category labels upright so they do not overlap.
plot.tick_params(axis="x", labelrotation=90)
plot.set_title("Count Plot of CollegeState", fontsize=14)

In [None]:
# Row count per distinct CollegeState value.
df["CollegeState"].value_counts()

In [None]:
# One-hot encode CollegeState. The prefix "state_" joined with the separator
# "_" gives column names of the form state__<value>.
df = get_dummies(data=df, columns=["CollegeState"], prefix=["state_"], prefix_sep="_")

In [None]:
# List the column labels of the frame that goes into the model.
df.columns

I will use Ridge Regression as there is some multicollinearity

Ridge provides advantage in this case

In [None]:
# Work on a copy of the cleaned frame and separate the target from the predictors.
data = df.copy()
y = data[["Salary"]]
X = data.drop(columns="Salary")

# Shuffled 60/40 train/test split with a fixed seed.
X_train,X_test,y_train,y_test = train_test_split(
    X, y, test_size=0.4, random_state=49, shuffle=True
)

# Ridge.fit returns the fitted estimator, so construction and fitting are chained.
model = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Root mean squared error first, then R^2, both on the held-out rows.
print(sqrt(mean_squared_error(y_test,y_pred)))
print(r2_score(y_test,y_pred))