In [None]:
## Import necessary library for analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
## Read the placement CSV into a DataFrame; the first CSV column becomes the index
df = pd.read_csv(
    r"../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv",
    index_col=0,
)
df.head(n=3)

### What each column name refers to

ssc_p:- Secondary Education(%) 10th Grade

ssc_b:- 10th Board of Education

hsc_p:- Higher Secondary Education(%) 12th Grade

hsc_b:- 12th Board of Education

hsc_s:- Specialization in Higher Secondary Education

degree_p:- Undergraduate (%)

degree_t:- Undergraduate degree type

workex:- Work experience

etest_p:- Placement test (%)

specialisation:- MBA specialisation

mba_p:- MBA (%)

status:- Hiring status    

In [None]:
## Count the missing (NaN) values in each column
df.isna().sum()

In [None]:
## Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
## Print the dtype and non-null count of every column, plus the overall frame size
df.info()

# Exploratory Data Analysis

### Plot histogram for all numeric variables

In [None]:
## One histogram per numeric column, 50 bins each, on a large canvas
df.hist(figsize=(20, 15), bins=50)
plt.show()

### Find marginal probability through two way table

Marginal probability is the probability of the occurrence of a single event.

In [None]:
## Two-way table of specialisation (rows) vs status (columns);
## normalize=True divides every cell by the grand total, margins=True adds the "All" row/column
pd.crosstab(df['specialisation'], df['status'], margins=True, normalize=True)

The table shows that Finance candidates make up 56% of the class and 44% of the class are placed Finance candidates, while HR candidates make up 44% of the class and only 24% of the class are placed HR candidates; that means the success ratio of the MBA Finance specialisation is higher than that of HR. Next, find out whether work experience is one of the factors behind the success of Finance candidates.

In [None]:
## Two-way table of specialisation (rows) vs work experience (columns);
## normalize='columns' makes each column sum to 1
pd.crosstab(df['specialisation'], df['workex'], margins=True, normalize='columns')

The marginal probability table above suggests that work experience plays an important role in the success of MBA Finance candidates.

In [None]:
## Two-way table of placement status (rows) vs work experience (columns);
## normalize='columns' makes each column sum to 1
pd.crosstab(df['status'], df['workex'], margins=True, normalize='columns')

According to the campus data and the three marginal probability tables above, having work experience makes it more likely to get placed. That means work experience influences whether a candidate gets placed, which is why the chance of getting placed in a company is better for MBA Finance candidates.

In [None]:
## Two-way table of undergraduate degree type (rows) vs status (columns);
## normalize=True divides every cell by the grand total
pd.crosstab(df['degree_t'], df['status'], margins=True, normalize=True)

More Commerce candidates get placed in companies, but there are also more of them; Science & Technology candidates have the best placement ratio among the undergraduate degree types.

#### The college placement data and the pairplot (shown further below) clearly indicate that percentage matters in the placement of students on the XYZ campus.

### Which degree specialization is much demanded by corporate?

In [None]:
## Bar chart of placement status counts, split by MBA specialisation
ax = sns.countplot(data=df, x="status", hue='specialisation')
ax.set_title("Degree Specialization vs Candidate Placement")
ax.set_xlabel("Status of Placement")
ax.set_ylabel("Number of candidate")
plt.show()

#### The plot above shows that the Mkt&Fin specialisation dominates campus placements.

### Now, check the correlation between degree and SSC percentages with respect to specialisation.

In [None]:
## Scatter plot with a fitted regression line per specialisation: SSC % vs degree %
sns.lmplot(
    data=df,
    x='ssc_p',
    y='degree_p',
    hue='specialisation',
    palette="Set2",
    legend=True,
)
plt.show()

### Does work experience affect the placement of a candidate?

In [None]:
## Bar chart of placement status counts, split by work experience
ax = sns.countplot(data=df, x="status", hue='workex')
ax.set_title("Candidate Work Experience in Placement")
ax.set_xlabel("Status of Placement")
ax.set_ylabel("Number of candidate")
plt.show()

#### According to the data, most of the candidates who have work experience are placed in a company. That suggests work experience is an important factor in getting placed.

### Find which undergraduate degree type has the most placed candidates

In [None]:
## Bar chart of candidate counts per undergraduate degree type, split by placement status
ax = sns.countplot(data=df, x="degree_t", hue='status')
ax.set_title("Candidate degree Technology in Placement")
ax.set_xlabel("Degree Technology")
ax.set_ylabel("Number of candidate")
plt.show()

### Relationship between placement of a student to their percentage.

Pairwise Plot :- Used to plot relationship in a dataset

Creates scatter plots for joint relationships and histograms for univariate distributions.


In [None]:
## Pairwise scatter plots of every column except salary, coloured by placement status
without_salary = df.drop(columns=["salary"])
sns.pairplot(without_salary, hue="status", kind="scatter")
plt.show()

### Check average or mean of numeric variable for unique values of status 

In [None]:
## Mean of every numeric column for each placement status.
## numeric_only=True is required: pandas >= 2.0 raises TypeError when mean()
## meets the string (object) columns instead of silently dropping them.
df.groupby("status").mean(numeric_only=True)

#### Check how many students have been placed or not

In [None]:
## Number of rows for each distinct value of the status column
df["status"].value_counts()

### Create another dataframe of only placed students and analyse

In [None]:
## Keep only the rows without any missing value and report the resulting shape.
## NOTE(review): this presumably isolates placed students because salary is
## missing for the others; confirm against the null counts computed earlier.
df_placed = df.dropna(axis=0, how="any")
df_placed.shape

#### Create sub dataset of only numeric data type variables

In [None]:
## Keep only the numeric columns of the placed-students frame.
## include="number" states the intent directly; excluding `object` would let
## any other non-numeric dtype (e.g. string, bool, datetime) slip through.
numeric_data = df_placed.select_dtypes(include="number")
numeric_data.shape

### Salary distribution through box-whiskers plot & Histogram

In [None]:
## Box-and-whiskers plot and histogram of salary in the same figure.
## The figure is split into two stacked axes: a thin one (15%) for the box plot
## and a tall one (85%) for the histogram.
f, (ax_box, ax_hist) = plt.subplots(2, gridspec_kw={"height_ratios": (.15, .85)})

## Pass the series as x= explicitly: newer seaborn treats the first positional
## argument as `data`, not `x`.
sns.boxplot(x=df_placed["salary"], ax=ax_box)

## histplot replaces the deprecated distplot(kde=False).
sns.histplot(x=df_placed["salary"], ax=ax_hist, kde=False)
plt.show()

### Find correlation between numeric variables in dataframe

Correlation :- Calculate relationship between two numerical variables.

Null values and categorical variables are excluded when computing Pearson's correlation.

• Positive correlation – the other variable has a tendency to also increase 

• Negative correlation – the other variable has a tendency to decrease

• No correlation – the other variable does not tend to either increase or decrease.

In [None]:
## Pairwise correlation (pandas default method) between the numeric columns
corr_matrix = numeric_data.corr(method="pearson")
corr_matrix

Now, show the correlation matrix into heatmap for better understanding and visualization

In [None]:
## Draw the correlation matrix as an annotated heatmap on a wide figure
plt.figure(figsize=(20, 10))
sns.heatmap(data=corr_matrix, cmap='YlGnBu', annot=True)
plt.show()

###### According to the correlation matrix and its heatmap, there is a slightly negative correlation between undergraduate (%) and salary. The salary variable only shows a positive correlation with MBA (%) and placement test (%). The strongest positive correlation is between undergraduate (%) and MBA (%).
##### In the heatmap of the correlation matrix, the darker the tile, the more positive the correlation between the two variables; the lighter the tile, the more negative the correlation.

In [None]:
## Variance Inflation Factor (disabled: this whole cell is commented out)
## NOTE(review): variance_inflation_factor is not imported in any visible cell;
## it is presumably statsmodels.stats.outliers_influence.variance_inflation_factor
## and must be imported before this code is re-enabled.
# X = numeric_data.drop(columns=['salary','mba_p',], axis=1)
# vif = pd.DataFrame()
# vif["features"] = X.columns
# vif["vif_factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
# print(vif)

### Find out whether the gender of a student affects the salary or any other variable

In [None]:
## A box-and-whiskers plot is useful for relating a numerical variable (salary)
## to a categorical one (gender); hue splits each box by MBA specialisation.

sns.boxplot(x='gender', y='salary', hue='specialisation', data=df_placed)
plt.title("Salary vs Gender")
plt.xlabel("Gender of an candidate")
## The original called plt.xlabel twice, which overwrote the gender label and
## left the y axis unlabelled; salary is on the y axis.
plt.ylabel("Salary of an candidate")
plt.show()

###### According to the box-and-whiskers plot above, the outliers (extreme values) in the salary column lie more in the male category than in the female one. One possible explanation is that some male candidates are hired for higher positions in a company, which is why they get a higher salary.

#### Pairplot of gender variable

In [None]:
## Pairwise scatter plots for placed students, coloured by gender
sns.pairplot(data=df_placed, hue="gender", kind="scatter")
plt.show()

### Find relationship between degree specialization and other variables of placed student.

In [None]:
## Pairwise scatter plots for placed students, coloured by MBA specialisation
sns.pairplot(data=df_placed, hue="specialisation", kind="scatter")
plt.show()

### Data Pre Processing

In [None]:
## Work on an independent copy so df_placed stays untouched by the preprocessing
df_predict = df_placed.copy(deep=True)
df_predict.shape

In [None]:
## Remove the columns that are not used as salary predictors
df_predict = df_predict.drop(columns=['status', 'degree_p'])
df_predict.shape

In [None]:
## One-hot encode the categorical columns; drop_first=True drops the first
## level of each so the dummies are not perfectly collinear
categorical_columns = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']
df_predict = pd.get_dummies(df_predict, columns=categorical_columns, drop_first=True)
df_predict.shape

In [None]:
## Preview the first three rows of the encoded frame
df_predict.head(3)

# Model Development :-  Salary Prediction

#### Now, build a Linear Regression and Random Forest Model on placed dataframe only.

In [None]:
## Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn import metrics

In [None]:
## Features: every column except salary; target: the salary column
y1 = df_predict['salary']
x1 = df_predict.drop(columns=['salary'])

In [None]:
## Compare the salary distribution before and after a natural-log transform
prices = pd.DataFrame({"1. Before": y1, "2. After": np.log(y1)})
prices.hist()
plt.show()

In [None]:
## Use the natural log of salary as the regression target.
## It is computed from the source column rather than from y1 itself, so
## re-running this cell does not apply the logarithm a second time.
y1 = np.log(df_predict['salary'])

#### Split the data into train and test sets to fit the model and predict. The train set contains 60% of the data because test_size=0.40; random_state seeds the pseudo-random number generator so that the split is reproducible.

In [None]:
## Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x1,
    y1,
    test_size=0.40,
    random_state=5,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

## LINEAR REGRESSION WITH PLACED DATA

In [None]:
## Ordinary least squares estimator that also fits an intercept term
lgr = LinearRegression(
    fit_intercept=True,
)

In [None]:
## Fit the linear model on the training split (fit returns the estimator itself)
model_lin1 = lgr.fit(X=x_train, y=y_train)

In [None]:
## Predict log-salary for the held-out test rows
salary_predictions_lin1 = lgr.predict(X=x_test)

In [None]:
## Mean squared error on the test split and its square root (RMSE)
lin_mse1 = mean_squared_error(y_true=y_test, y_pred=salary_predictions_lin1)
lin_rmse1 = np.sqrt(lin_mse1)
print(lin_rmse1)

In [None]:
## Coefficient of determination (R^2) on the train and test splits
r2_lin_train1 = model_lin1.score(x_train, y_train)
r2_lin_test1 = model_lin1.score(x_test, y_test)
print(r2_lin_test1, r2_lin_train1)

In [None]:
## Regression diagnostics: residual plot analysis.
## A residual is the difference between the actual test value and the prediction.
residuals1 = y_test - salary_predictions_lin1
## x and y are passed as vectors, so no data= frame is needed; the original
## passed the unrelated df_placed, which was never used for the lookup.
sns.regplot(x=salary_predictions_lin1, y=residuals1, scatter=True, fit_reg=False)
residuals1.describe()

In [None]:
# Fitted intercept of the linear model
print(model_lin1.intercept_)

# Fitted slope of each feature, labelled with the feature name
coeff_df = pd.DataFrame({'Coefficient': model_lin1.coef_}, index=x1.columns)
coeff_df

### Check the difference between the actual value and predicted value.

In [None]:
## Actual vs predicted log-salary for the test rows (linear regression)
df1 = pd.DataFrame({'Actual': y_test, 'Predicted': salary_predictions_lin1})
df1.head(n=10)

### Now let's plot the comparison of Actual and Predicted values

In [None]:
## Side-by-side bars of actual and predicted values, with major and minor grid styles
df1.plot(figsize=(10, 8), kind='bar')
for tick_kind, line_style in (('major', '-'), ('minor', ':')):
    plt.grid(which=tick_kind, linestyle=line_style, linewidth='0.5')
plt.show()

## Random Forest Regressor

In [None]:

## MODEL PARAMETERS
## max_features=1.0 (use all features at every split) is what 'auto' meant for
## regressors; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
## where it raises an invalid-parameter error.
rf = RandomForestRegressor(
    n_estimators=100,
    max_features=1.0,
    max_depth=100,
    min_samples_split=10,
    min_samples_leaf=3,
    random_state=4,
)

In [None]:
## Fit the random forest on the training split (fit returns the estimator itself)
model_rf1 = rf.fit(X=x_train, y=y_train)

In [None]:
## Predict log-salary for the held-out test rows with the random forest
salary_predictions_rf1 = rf.predict(X=x_test)

In [None]:
## Mean squared error on the test split and its square root (RMSE)
rf_mse1 = mean_squared_error(y_true=y_test, y_pred=salary_predictions_rf1)
rf_rmse1 = np.sqrt(rf_mse1)
print(rf_rmse1)

In [None]:
## Coefficient of determination (R^2) on the train and test splits
r2_rf_train1 = model_rf1.score(x_train, y_train)
r2_rf_test1 = model_rf1.score(x_test, y_test)
print(r2_rf_test1, r2_rf_train1)

### Check the difference between the actual value and predicted value.

In [None]:
## Actual vs predicted log-salary for the test rows (random forest)
df2 = pd.DataFrame({'Actual': y_test, 'Predicted': salary_predictions_rf1})
df2.head(n=10)

### Now let's plot the comparison of Actual and Predicted values

In [None]:
## Side-by-side bars of actual and predicted values, with major and minor grid styles
df2.plot(figsize=(10, 8), kind='bar')
for tick_kind, line_style in (('major', '-'), ('minor', ':')):
    plt.grid(which=tick_kind, linestyle=line_style, linewidth='0.5')
plt.show()

# Model:- Placement Status Prediction

In [None]:
## Independent copy of the full dataset for the classification task;
## preview the new frame itself rather than the original df.
df_status = df.copy()
df_status.head(4)

In [None]:
## Salary is only known for placed students, so it cannot be a predictor of status.
## Reassigning with errors='ignore' keeps this cell re-runnable: the original
## in-place drop raised KeyError on a second run.
df_status = df_status.drop(columns=['salary'], errors='ignore')

## Categorical columns to one-hot encode (first level of each is dropped)
X_features = ['gender','ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation']
encoded_df = pd.get_dummies(df_status[X_features], drop_first=True)

## Dummy columns first, followed by all the original columns
df_final = pd.concat([encoded_df, df_status], axis=1, sort=False)
df_final.columns

In [None]:
## Feature matrix: drop the target and the raw categorical columns (their
## dummy-encoded versions stay). The labels are passed as a plain list; the
## original passed a DataFrame to drop(), which only worked because iterating
## a DataFrame yields its column names.
X = df_final.drop(columns=['status', *X_features])

## Target: placement status
y = df_final['status']

## Supervised learning :- Gaussian Naive Bayes classifier
### Model validation via cross-validation :- Two Fold

In [None]:
from sklearn.model_selection import train_test_split

## Divide the data 70/30 into two parts.
## NOTE: y1 rebinds the name used earlier for the regression target.
X1, X2, y1, y2 = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)
print(X1.shape, X2.shape, y1.shape, y2.shape)

In [None]:
# 1. choose the model class
from sklearn.naive_bayes import GaussianNB

# 2. instantiate the model with its default settings
model = GaussianNB()

In [None]:
## Two validation trials: each part of the split takes a turn as the holdout set.
## Trial 1: train on part 1, predict part 2.
model.fit(X1, y1)
y2_model = model.predict(X2)
## Trial 2: refit on part 2, predict part 1.
model.fit(X2, y2)
y1_model = model.predict(X1)

In [None]:
## accuracy_score gives the fraction of predicted labels that match the true labels
from sklearn.metrics import accuracy_score

(
    accuracy_score(y_true=y1, y_pred=y1_model),
    accuracy_score(y_true=y2, y_pred=y2_model),
)

### Five-fold cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

## Score the classifier on five cross-validation folds of the full data
score1 = cross_val_score(estimator=model, X=X, y=y, cv=5)
score1

In [None]:
## Report the mean fold score and a +/- two-standard-deviation spread.
## The original message labelled 2 * std as "standard deviation".
print(f"Mean cross-validation score: {score1.mean():.3f} (+/- {score1.std() * 2:.3f}, two standard deviations)")

## Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
## Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=5,
)
## Allow up to 1000 solver iterations
logmodel = LogisticRegression(max_iter=1000)

In [None]:
## Train the logistic regression on the training split
logmodel.fit(X=X_train, y=y_train)

In [None]:
## Predicted placement status for the held-out test rows
predictions = logmodel.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report

## Per-class precision, recall and F1 on the test split
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

In [None]:
## Confusion matrix: counts of true labels (rows) against predicted labels (columns)
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=predictions)