In [None]:
from os import getcwd
from pandas import DataFrame, read_csv, get_dummies
from scipy.stats import zscore
from statsmodels.stats.outliers_influence import variance_inflation_factor
from matplotlib.pyplot import figure
from seaborn import set_theme,scatterplot,barplot,countplot,heatmap,violinplot,boxplot
from numpy import abs,median,around
%matplotlib inline

In [None]:
# Global seaborn appearance applied to every figure drawn below.
set_theme(
    context="notebook",
    style='darkgrid',
    palette='inferno',
)

In [None]:
# The CSV is read from the path prefix that precedes "individual_impacts"
# in the current working directory.
working_dir = getcwd()
marker_pos = working_dir.find("individual_impacts")
if marker_pos == -1:
    # str.find returns -1 when the marker is absent; slicing with -1 would
    # silently drop the last character of the path and yield a bogus file name.
    raise FileNotFoundError(
        'Cannot locate "individual_impacts" in the working directory: ' + working_dir
    )
df = read_csv(working_dir[:marker_pos] + "Engineering_graduate_salary.csv")

In [None]:
# Narrow the frame to the two board columns and Salary.
columns_of_interest = ["10board", "12board", "Salary"]
df = df.filter(items=columns_of_interest)

In [None]:
# Replace salary outliers (|z-score| > 3) with the median salary.
# An explicit boolean mask is used instead of passing a list that wraps a whole
# Series to Series.replace, which depends on how pandas happens to flatten it.
salary_outliers = abs(zscore(df["Salary"])) > 3
df["Salary"] = df["Salary"].mask(salary_outliers, median(df["Salary"]))

In [None]:
# Scatter of the 10board column against the 12board column.
figure(figsize=(16, 16))
plot = scatterplot(data=df, x="10board", y="12board")
# Blank out the tick labels on both axes.
plot.set_xticklabels([])
plot.set_yticklabels([])
plot.set_title("Scatter Plot of 10board with 12board", fontsize=14)

In [None]:
# Number of rows whose 10board and 12board values are identical.
same_board = df["10board"] == df["12board"]
len(df[same_board])

The majority of students chose the same board in class 12 as in class 10.

For the rest of the analysis I will drop the `10board` column.

In [None]:
# Remove the 10board column. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns=["10board"], errors="ignore")

In [None]:
# Frequency of each 12board value.
figure(figsize=(16, 8))
plot = countplot(data=df, x="12board")
# Re-apply the existing tick labels rotated so the board names do not overlap.
board_labels = plot.get_xticklabels()
plot.set_xticklabels(board_labels, rotation="vertical")
plot.set_title("Count Plot of 12board", fontsize=14)

In [None]:
# Number of students per 12board value (value_counts sorts most common first).
board_column = df["12board"]
board = board_column.value_counts()
board

Some boards were chosen by very few students in this DataFrame.

So I will group the rarely chosen boards under `other`.

In [None]:
# Boards with at most 10 students are treated as rare.
is_rare = board.le(10)
rare_board = board[is_rare]
rare_board

In [None]:
def remove_rare_board(value):
    """Return 'other' for a rare board name, otherwise return the name unchanged.

    Membership is checked with ``value in rare_board``. ``rare_board`` is the
    notebook-level Series of rare boards, and ``in`` on a pandas Series tests
    the index labels (the board names), not the counts.
    """
    return 'other' if value in rare_board else value
# Collapse every rare board into the single 'other' label, then re-count.
grouped_boards = df["12board"].apply(remove_rare_board)
df["12board"] = grouped_boards
df["12board"].value_counts()

In [None]:
# Discard rows whose 12board value is the literal string '0'.
keep_row = df["12board"].ne('0')
df = df[keep_row]

In [None]:
# Frequency of each 12board value after the rare boards were grouped.
figure(figsize=(16, 8))
plot = countplot(data=df, x="12board")
tick_labels = plot.get_xticklabels()
plot.set_xticklabels(tick_labels, rotation="vertical")
plot.set_title("Count Plot of 12board", fontsize=14)

The majority of students chose the CBSE board or a state board.

In [None]:
# Salary distribution for each 12board value.
figure(figsize=(16,8))
plot = violinplot(x=df["12board"], y=df["Salary"])
# Rotate the board names so they do not overlap.
plot.set_xticklabels(plot.get_xticklabels(),rotation="vertical")
# The title names the chart type actually drawn (it previously said "Bar Plot").
plot.set_title("Violin Plot of 12board with Salary", fontsize=14)

Students from the ISC board received the highest package.

But on average, MP board students received higher packages.

In [None]:
# Salary quartiles and outliers for each 12board value.
figure(figsize=(16,8))
plot = boxplot(x=df["12board"], y=df["Salary"])
# Rotate the board names so they do not overlap.
plot.set_xticklabels(plot.get_xticklabels(),rotation="vertical")
# The title names the chart type actually drawn (it previously said "Bar Plot").
plot.set_title("Box Plot of 12board with Salary", fontsize=14)

In [None]:
# Salary per 12board value drawn as bars.
figure(figsize=(16, 8))
plot = barplot(data=df, x="12board", y="Salary")
tick_labels = plot.get_xticklabels()
plot.set_xticklabels(tick_labels, rotation="vertical")
plot.set_title("Bar Plot of 12board with Salary", fontsize=14)

In [None]:
# One-hot encode 12board into indicator columns prefixed with "board".
df = get_dummies(
    data=df,
    columns=["12board"],
    prefix="board",
)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df.
figure(figsize=(16, 16))
correlation_matrix = df.corr()
plot = heatmap(correlation_matrix, annot=True)
plot.set_title("Correlation Matrix", fontsize=18)

In [None]:
# Every column name except Salary.
boards = list(df.columns)
del boards[boards.index('Salary')]
boards

In [None]:
# Correlation of every board indicator column with Salary.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace; a plain `for board in ...` loop would overwrite the `board`
# value-counts Series defined in an earlier cell.
corelations = {column: df[column].corr(df["Salary"]) for column in boards}

# Dict views are materialised as lists before being handed to DataFrame.
frame = DataFrame({
    "Board": list(corelations.keys()),
    "Corr": list(corelations.values())
})

frame

In [None]:
# Share of students in each salary band, for every board indicator column.
# The loop variable is called `column` so it does not overwrite the `board`
# value-counts Series defined in an earlier cell.
dic = {}

for column in boards:
    # Salaries of the students flagged for this board column.
    salaries = df.loc[df[column] == 1, "Salary"]

    students_in_board = salaries.shape[0]
    if students_in_board == 0:
        # No student in this column: skip it instead of dividing by zero.
        continue

    # Vectorized band counts replace the per-row Python loop.
    to_3 = (salaries < 300000).sum()
    to_6 = ((salaries >= 300000) & (salaries < 600000)).sum()
    # Whatever is in neither lower band lands here; missing salaries fail both
    # comparisons above and are therefore counted in this band as well.
    to_inf = students_in_board - to_3 - to_6

    dic[column] = {
        "0 to 300000": around(to_3 / students_in_board, decimals=1),
        "300000 to 600000": around(to_6 / students_in_board, decimals=1),
        "600000 or more": around(to_inf / students_in_board, decimals=1)
        }

package_frame = DataFrame(dic)

package_frame

# From the above data and analysis we can deduce:

### ***8 out of 10*** ISC board students have received packages between ***3 - 6 lakh***

### On average, MP board students received the highest packages
- 2 out of 10 students received packages above ***6 lakh***
- 6 out of 10 students received packages between ***3 - 6 lakh***