In [None]:
from os import getcwd
from pandas import DataFrame, read_csv, get_dummies
from scipy.stats import zscore
from matplotlib.pyplot import figure
from seaborn import set_theme,barplot,countplot,heatmap,violinplot,boxplot
from numpy import abs,median,around
%matplotlib inline

In [None]:
# Apply a single seaborn theme to the figures drawn in this notebook.
set_theme(
    context="notebook",
    style="darkgrid",
    palette="inferno",
)

In [None]:
# The CSV is read from the part of the working-directory path that precedes
# the "individual_impacts" folder name.
working_dir = getcwd()
marker_index = working_dir.find("individual_impacts")
if marker_index == -1:
    # str.find returns -1 when the marker is absent; slicing with -1 would
    # silently drop the last character of the path and build a bogus location.
    raise FileNotFoundError(
        'Cannot locate "Engineering_graduate_salary.csv": the working directory '
        f'{working_dir!r} does not contain "individual_impacts".'
    )
df = read_csv(working_dir[:marker_index] + "Engineering_graduate_salary.csv")

In [None]:
# Keep only the two columns used in the rest of the notebook.
columns_of_interest = ["Specialization", "Salary"]
df = df.filter(items=columns_of_interest)

In [None]:
# Replace salary outliers (|z-score| > 3) with the median salary.
# A boolean mask targets the outlying rows directly; handing `replace` a
# Series nested inside a list does not describe a list of values to swap.
salary_median = median(df["Salary"])  # computed before any value is replaced
is_outlier = abs(zscore(df["Salary"])) > 3
df["Salary"] = df["Salary"].mask(is_outlier, salary_median)

In [None]:
# Number of rows per specialization.
figure(figsize=(16, 8))
plot = countplot(data=df, x="Specialization")
# Draw the category labels vertically along the x axis.
plot.set_xticklabels(labels=plot.get_xticklabels(), rotation="vertical")
plot.set_title(label="Count Plot of Specialization", fontsize=14)

In [None]:
# Occurrence count of every distinct Specialization value.
specialization_column = df["Specialization"]
specializations = specialization_column.value_counts()
specializations

In [None]:
# Specializations whose count is 10 or fewer.
is_rare = specializations.le(10)
rare_specializations = specializations.loc[is_rare]
rare_specializations

In [None]:
def remove_rare_specialization(value, rare=None):
    """Map a rare specialization label to 'other'; return any other label unchanged.

    Parameters
    ----------
    value
        The specialization label to check.
    rare : container, optional
        Collection of labels regarded as rare. Membership is tested with
        ``value in rare``; for a pandas Series that means a lookup in its
        index (the labels), not in its values. When omitted, the notebook-level
        ``rare_specializations`` is used, so the function can still be passed
        directly to ``Series.apply``.

    Returns
    -------
    'other' if ``value`` is in ``rare``, otherwise ``value`` itself.
    """
    if rare is None:
        # Resolved at call time so the current notebook variable is used.
        rare = rare_specializations
    return 'other' if value in rare else value
# Relabel the Specialization column through remove_rare_specialization,
# then show the resulting counts.
grouped_specializations = df["Specialization"].apply(remove_rare_specialization)
df["Specialization"] = grouped_specializations
df["Specialization"].value_counts()

In [None]:
# Count plot of the Specialization column as it stands at this point.
figure(figsize=(16, 8))
plot = countplot(data=df, x="Specialization")
# Draw the category labels vertically along the x axis.
plot.set_xticklabels(labels=plot.get_xticklabels(), rotation="vertical")
plot.set_title(label="Count Plot of Specialization", fontsize=14)

In [None]:
# Salary distribution for each specialization, drawn as violins.
figure(figsize=(16, 8))
plot = violinplot(data=df, x="Specialization", y="Salary")
# Draw the category labels vertically along the x axis.
plot.set_xticklabels(labels=plot.get_xticklabels(), rotation="vertical")
plot.set_title(label="Violin Plot of Specialization with Salary", fontsize=14)

In [None]:
# Salary per specialization as bars (seaborn's default bar aggregation).
figure(figsize=(16, 8))
plot = barplot(data=df, x="Specialization", y="Salary")
# Draw the category labels vertically along the x axis.
plot.set_xticklabels(labels=plot.get_xticklabels(), rotation="vertical")
plot.set_title(label="Bar Plot of Specialization with Salary", fontsize=14)

In [None]:
# Salary spread for each specialization, drawn as box plots.
figure(figsize=(16, 8))
plot = boxplot(data=df, x="Specialization", y="Salary")
# Draw the category labels vertically along the x axis.
plot.set_xticklabels(labels=plot.get_xticklabels(), rotation="vertical")
plot.set_title(label="Box Plot of Specialization with Salary", fontsize=14)

In [None]:
# One-hot encode the Specialization column; the new indicator columns
# carry the "specialization" prefix.
df = get_dummies(
    data=df,
    columns=["Specialization"],
    prefix="specialization",
)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.
figure(figsize=(16, 16))
correlation_matrix = df.corr()
plot = heatmap(correlation_matrix, annot=True)
plot.set_title(label="Correlation Matrix", fontsize=18)

In [None]:
# All column names of df except the Salary column.
specializations = list(df.columns)
specializations.remove("Salary")
specializations

In [None]:
# Correlation of every column listed in `specializations` with Salary,
# collected into a two-column table.
salary_column = df["Salary"]
corelations = {
    column_name: df[column_name].corr(salary_column)
    for column_name in specializations
}

frame = DataFrame({
    "Specialization": list(corelations.keys()),
    "Corr": list(corelations.values()),
})

frame

In [None]:
# Share of students per salary band for every column in `specializations`.
# Band limits are inclusive at the bottom and exclusive at the top.
LOWER_BAND_LIMIT = 300000
UPPER_BAND_LIMIT = 600000

dic = {}

for specialization in specializations:
    # Salaries of the rows flagged (== 1) for this specialization.
    # Missing salaries are dropped: NaN compares False against every limit,
    # so it must not be left to fall into a catch-all top band.
    salaries = df.loc[df[specialization] == 1, "Salary"].dropna()
    students_in_specialization = len(salaries)

    if students_in_specialization == 0:
        # Nothing to divide by: report undefined shares instead of raising
        # ZeroDivisionError.
        share_to_3 = share_to_6 = share_to_inf = float("nan")
    else:
        in_low_band = salaries < LOWER_BAND_LIMIT
        in_top_band = salaries >= UPPER_BAND_LIMIT
        in_mid_band = ~in_low_band & ~in_top_band
        share_to_3 = in_low_band.sum() / students_in_specialization
        share_to_6 = in_mid_band.sum() / students_in_specialization
        share_to_inf = in_top_band.sum() / students_in_specialization

    # Shares are rounded to one decimal place, i.e. "x out of 10" students.
    dic[specialization] = {
        "0 to 300000": around(share_to_3, decimals=1),
        "300000 to 600000": around(share_to_6, decimals=1),
        "600000 or more": around(share_to_inf, decimals=1),
    }

package_frame = DataFrame(dic)

package_frame

# From the above data and analysis, we can deduce:

### ***6 out of 10*** Computer Engineering students have received packages between ***3 - 6 lakh***

### On average, Computer Engineering students received the highest packages
- 1 out of 10 students received packages above ***6 lakh***
- 6 out of 10 students received packages between ***3 - 6 lakh***
- 3 out of 10 students received packages up to ***3 lakh***