## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('dark_background')

In [None]:
# Load the engineering graduate salary CSV from the (relative) input directory
# and preview the first rows.
DATA_PATH = '../input/engineering-graduate-salary-prediction/Engineering_graduate_salary.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
df.shape

In [None]:
df.columns

## Dropping a Few Columns

In [None]:
# Columns removed before any analysis; everything else is kept.
columns_to_drop = [
    'ID', 'DOB', 'CollegeID', '12graduation', '10board', '12board',
    'CollegeState', 'CollegeCityID', 'CollegeCityTier', 'GraduationYear',
]
df.drop(columns=columns_to_drop, inplace=True)
df.head()

In [None]:
# Remove rows that are exact duplicates across all remaining columns.
# NOTE(review): the 'ID' column has already been dropped at this point, so two
# different records whose remaining fields all match are collapsed into one.
df = df.drop_duplicates()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
df.Degree.unique()

## Working on Specialization Column

In [None]:
df.Specialization.unique()

In [None]:
df.Specialization.value_counts()

In [None]:
# Number of rows per specialization, most frequent first.
specialization = df['Specialization'].value_counts(sort=True, ascending=False)
specialization

In [None]:
# Specializations that occur 10 times or fewer. The bound is inclusive
# (<= 10) even though the variable name says "less than 10".
specializationlessthan10 = specialization.loc[specialization.le(10)]
specializationlessthan10

In [None]:
def removespeciallessthan10(value):
    """Map a rare specialization to the catch-all label 'other'.

    ``value`` is tested for membership in the module-level
    ``specializationlessthan10`` Series. The ``in`` operator on a pandas
    Series checks its index labels (here the specialization names), not its
    values (the counts).

    Returns 'other' when ``value`` is one of the rare specializations,
    otherwise returns ``value`` unchanged.
    """
    return 'other' if value in specializationlessthan10 else value
# Replace every rare specialization by 'other', then list the categories left.
df['Specialization'] = df['Specialization'].apply(removespeciallessthan10)
df['Specialization'].unique()

In [None]:
df.head()

## Working on GPA Column

In [None]:
# Plot collegeGPA against the row index to spot values on an unusual scale.
plt.scatter(x=df.index, y=df['collegeGPA'])

In [None]:
# Keep only rows whose collegeGPA is strictly greater than 40.
# NOTE(review): 40 is a hand-picked cut-off, presumably chosen from the
# scatter plot of collegeGPA — confirm the intended threshold.
df = df.loc[df['collegeGPA'].gt(40)]
df.shape

## Visualizing English, Logical and Quant

In [None]:
# Scatter the English, Logical and Quant columns against the row index on a
# 2x2 grid; the fourth panel is intentionally left empty.
plt.figure(figsize = (10,10))
for panel_number, column_name in enumerate(['English', 'Logical', 'Quant'], start=1):
    plt.subplot(2, 2, panel_number)
    plt.scatter(df.index, df[column_name])
    plt.title(column_name)

plt.show()

## Visualizing a Few Other Columns

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Scatter the Domain, ComputerProgramming and ElectronicsAndSemicon columns
# against the row index on a 2x2 grid; the fourth panel is left empty.
plt.figure(figsize = (10,10))
for panel_number, column_name in enumerate(
        ['Domain', 'ComputerProgramming', 'ElectronicsAndSemicon'], start=1):
    plt.subplot(2, 2, panel_number)
    plt.scatter(df.index, df[column_name])
    plt.title(column_name)

plt.show()

## Replacing -1 with Null

In [None]:
# Turn the value -1 into NaN so it can be imputed afterwards.
# NOTE(review): the replacement is applied to every column, so any genuine
# measurement that happens to equal -1 is nulled too — confirm -1 is only
# ever used as a "missing" marker in this dataset.
df = df.replace(to_replace=-1, value=np.nan)

## Filling Null Values with Mean

In [None]:
# Find the columns that contain at least one NaN. The null mask is computed
# once for the whole frame instead of being rebuilt for every column.
cols_with_nan = df.columns[df.isna().any()].tolist()

# Impute each affected column with its own mean (NaNs are ignored by mean()).
for column in cols_with_nan:
    df[column] = df[column].fillna(df[column].mean())

# **Data Visualization**

In [None]:
df.head()

In [None]:
# Number of rows per gender. The data vector is passed as `x=` because recent
# seaborn releases no longer accept it positionally.
sns.countplot(x=df.Gender, palette = 'inferno')

In [None]:
# 10percentage vs 12percentage. x/y must be keyword arguments: seaborn >= 0.12
# raises a TypeError when they are passed positionally.
sns.scatterplot(x=df['10percentage'], y=df['12percentage'])

In [None]:
# Same scatter, coloured by CollegeTier. x/y are passed by keyword because
# seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['10percentage'], y=df['12percentage'], hue = df.CollegeTier)

**`10percentage` is closely related to `12percentage`, so we drop `10percentage` and keep only `12percentage`.**

In [None]:
# Remove the 10percentage column in place; 12percentage is kept.
df.drop(columns=['10percentage'], inplace=True)

In [None]:
# Number of rows per specialization; tick labels are rotated so the long
# category names stay readable. The column is passed as `x=` because recent
# seaborn releases no longer accept the data vector positionally.
plt.figure(figsize = (10,6))
sns.countplot(x=df['Specialization'])
plt.xticks(rotation = 90)

In [None]:
df.head()

## **Plotting GPA vs Salary**

In [None]:
# collegeGPA vs Salary, coloured by Specialization. x/y are keyword arguments
# because seaborn >= 0.12 raises a TypeError on positional data vectors.
plt.figure(figsize = (15,8))
sns.scatterplot(x=df.collegeGPA, y=df.Salary, hue = df.Specialization, palette = 'inferno')

In [None]:
# collegeGPA vs Salary, coloured by Degree. x/y are keyword arguments because
# seaborn >= 0.12 raises a TypeError on positional data vectors.
plt.figure(figsize = (15,8))
sns.scatterplot(x=df.collegeGPA, y=df.Salary, hue = df.Degree, palette = 'inferno')

## Openness to Experience vs Salary

In [None]:
# openess_to_experience vs Salary. x/y are keyword arguments because
# seaborn >= 0.12 raises a TypeError on positional data vectors.
plt.figure(figsize = (12,6))
sns.scatterplot(x=df.openess_to_experience, y=df.Salary)

## Domain vs Salary

In [None]:
# Domain vs Salary, coloured by Degree. x/y are keyword arguments because
# seaborn >= 0.12 raises a TypeError on positional data vectors.
plt.figure(figsize = (12,6))
sns.scatterplot(x=df.Domain, y=df.Salary, hue = df.Degree, palette = 'inferno')

In [None]:
# Correlation heatmap over the numeric columns only. Gender, Degree and
# Specialization have not been label-encoded yet at this point, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently skipped them), so they are filtered out explicitly.
plt.figure(figsize = (8,5))
sns.heatmap(df.select_dtypes(include='number').corr())

In [None]:
# sns.pairplot(df)

## **Data Preprocessing**

In [None]:
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refitted for each column in turn, so after the
# loop `le` only remembers the classes of the last column (Specialization).
le = LabelEncoder()
for categorical_column in ['Gender', 'Degree', 'Specialization']:
    df[categorical_column] = le.fit_transform(df[categorical_column])

df.head()

## **Creating XGBoost Model**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

In [None]:
# Features are every column except the target; the target is Salary.
x = df.drop('Salary', axis=1)
y = df['Salary']

# Split BEFORE scaling so that the scaler never sees the held-out rows.
# Fitting StandardScaler on the full data leaks test-set mean/variance into
# the training features.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=0)

# Learn the scaling statistics on the training rows only, then apply the same
# transformation to the test rows. Both results are NumPy arrays.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Train an XGBoost regressor with the library's default hyper-parameters.
xgb = XGBRegressor()
xgb.fit(X=X_train, y=y_train)

## **Predicting X_test Data**

In [None]:
# Predicted salaries for the held-out test features.
predictions = xgb.predict(X_test)

In [None]:
# Residuals: actual salary minus predicted salary (positive = model under-predicted).
diff = y_test - predictions

## **Visualizing Difference**

In [None]:
# Distribution of the residuals. sns.distplot is deprecated and scheduled for
# removal; histplot with a KDE overlay on a density scale is its replacement.
plt.figure(figsize = (12,6))
sns.histplot(x=diff, kde=True, stat='density')

## **Checking R^2 Score**

In [None]:
# Score the fitted regressor on the held-out split and report the value.
xgb_r2_score = xgb.score(X=X_test, y=y_test)

print("XGBoost R2 Score: ", xgb_r2_score)