# 0 library import

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score,recall_score,precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

# 1 load and read data



## 1.1 Load data

In [None]:
# Load the campus-placement dataset, discard the serial-number column
# and preview the first ten rows.
df = pd.read_csv('../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')
df = df.drop(columns=['sl_no'])
df.head(10)

-----------------------------------
## 1.2 Data reading

In [None]:
# Show the dtype pandas inferred for every column.
df.dtypes

### 1.2.1 Variable description


### Descriptive statistics table


In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

### Analysing missing values

In [None]:
# Per column: does it contain at least one missing value?
df.isna().any()

In [None]:
# Salary column with its missing values replaced by 0.
Salary = df['salary'].fillna(0)

In [None]:
# Placement status column as loaded (not yet recoded).
Status = df['status']

In [None]:
# Count the rows where salary and placement status agree: a zero (originally
# missing) salary must go with 'Not Placed', and a real salary with 'Placed'.
# `Status` still holds the raw string labels at this point (the numeric
# recoding happens in a later cell), so it must be compared against the label
# rather than against 0 -- comparing with 0 is always False and would merely
# count the non-zero salaries.
sum(((Salary == 0) & (Status == 'Not Placed')) | ((Salary != 0) & (Status != 'Not Placed')))

The missing values are in the salary variable and coincide with the students who were not placed, so we can replace these NaN values with zeros.

### Variable transformations

In [None]:
# Recode every categorical column as integers so that the models below can
# consume them directly.
df['ssc_b']=df['ssc_b'].replace({'Others':1, 'Central':0})
df['hsc_b']=df['hsc_b'].replace({'Others':1, 'Central':0})
df['gender']=df['gender'].replace({'M':1, 'F':0})
df['status']=df['status'].replace({'Placed':1, 'Not Placed':0})
df['workex']=df['workex'].replace({'Yes':1, 'No':0})
df['hsc_s']=df['hsc_s'].replace({'Arts':1, 'Commerce':2, 'Science':3})
df['degree_t']=df['degree_t'].replace({'Comm&Mgmt':1, 'Sci&Tech':2, 'Others':3})
df['specialisation']=df['specialisation'].replace({'Mkt&Fin':0, 'Mkt&HR':1})

# Missing salaries become 0. Assign the result back instead of calling
# fillna(inplace=True) on the selected column: the chained in-place form
# operates on a temporary and silently does nothing under pandas
# copy-on-write.
df['salary'] = df['salary'].fillna(0)

# Discretise salary into three ordered bins. The first bin starts just below
# zero, so the zero-filled salaries land in bin 1 together with salaries up
# to 230000.
df['salary_ints'] = pd.cut(df['salary'],
                          bins=[-0.01, 230000, 310000, np.inf],
                          labels = [1, 2, 3])

# pd.cut returns a categorical; store plain integer codes instead.
df['salary_ints']=df['salary_ints'].astype('int')
df.head(10)

In [None]:
# Distinct salary-bin codes that actually occur in the data.
np.unique(df.salary_ints)

In [None]:
# Summary statistics using pandas' default column selection.
df.describe()

Let's look at the unique values of each variable.

In [None]:
# Print each column name followed by the distinct values it takes.
for column_name, column_values in df.items():
    print(column_name, column_values.unique())

Description of the variables after the transformation

In [None]:
# Summary statistics of all columns, rounded to two decimals for readability.
df.describe(include='all').round(2)

Histograms for several variables

In [None]:
# Reverse-cumulative histogram of salary; rows whose salary is 0 are left out.
salary = df.loc[df['salary'] != 0, 'salary']

fig1 = plt.figure()
fig1.gca().hist(salary, bins=40, cumulative=-1, align='left',
                rwidth=0.75, color='royalblue')
fig1.gca().set_title('Distribución acumulada del salario', fontsize=15)
fig1.gca().set_xlabel('salario', fontsize=12)
fig1.gca().set_ylabel('Frecuencia', fontsize=12)

fig1.savefig('Distribucón acumulada del salario.jpg')

In [None]:
# Histogram of the ssc_p scores (20 bins), saved to disk as a JPEG.
fig2 = plt.figure()
fig2.gca().hist(df['ssc_p'], bins=20, rwidth=0.8, color='royalblue')
fig2.gca().set_title('10º curso', fontsize=15)
fig2.gca().set_xlabel('puntuación obtenida', fontsize=12)
fig2.gca().set_ylabel('Frecuencia', fontsize=12)
fig2.savefig('Histograma 10º curso.jpg')

In [None]:
# Histogram of the hsc_p scores (20 bins), saved to disk as a JPEG.
fig3 = plt.figure()
fig3.gca().hist(df['hsc_p'], bins=20, rwidth=0.8, color='royalblue')
fig3.gca().set_title('12º curso', fontsize=15)
fig3.gca().set_xlabel('puntuación obtenida', fontsize=12)
fig3.gca().set_ylabel('Frecuencia', fontsize=12)
fig3.savefig('Histograma 12º curso.jpg')

In [None]:
# Histogram of the degree_p scores (15 bins), saved to disk as a JPEG.
fig4 = plt.figure()
fig4.gca().hist(df['degree_p'], bins=15, rwidth=0.8, color='royalblue')
fig4.gca().set_title('Graduado', fontsize=15)
fig4.gca().set_xlabel('puntuación obtenida', fontsize=12)
fig4.gca().set_ylabel('Frecuencia', fontsize=12)
fig4.savefig('Histograma punt. Graduado.jpg')

In [None]:
# Histogram of the employability-test score. The original cell plotted hsc_p
# a second time (it is already shown in the '12º curso' figure), which looks
# like a copy-paste slip; etest_p matches the 'empleabilidad' output file name.
# NOTE(review): the title says 'Especialización' -- confirm whether etest_p or
# another score column was intended and align title and file name accordingly.
fig5 = plt.figure()
plt.hist(df["etest_p"], bins = 25, color = 'royalblue', rwidth = 0.8)
plt.title('Especialización', fontsize = 15)
plt.xlabel('puntuación obtenida', fontsize = 12)
plt.ylabel('Frecuencia', fontsize = 12)
fig5.savefig('Histograma empleabilidad.jpg')

In [None]:
plt.rcParams['axes.labelsize'] = 20

# Each array is [count of rows coded 1, count of rows coded 0]. With the
# recoding applied earlier, 1 means male / has work experience / placed, so
# every label list names the "1" class first. (The labels were previously in
# the opposite order, which attached each percentage to the wrong group.)
pie_gender = np.array([sum(df.gender), len(df)-sum(df.gender)])
pie_gender_labels = ["Hombre", "Mujer"]

pie_workex = np.array([sum(df.workex), len(df)-sum(df.workex)])
pie_workex_labels = ["Sí exp.", "No exp."]

pie_status = np.array([sum(df.status), len(df)-sum(df.status)])
pie_status_labels = ["Sí trabaja", "No Trabaja"]

# One row of three pie charts: gender, work experience, placement status.
fig6 = plt.figure(figsize=(15,15))

plt.subplot(1,3,1)
plt.pie(pie_gender, labels = pie_gender_labels, autopct='%1.1f%%', shadow = True,
        colors = ('royalblue','lightsteelblue'))

plt.subplot(1,3,2)
plt.pie(pie_workex, labels = pie_workex_labels, autopct='%1.1f%%', shadow = True,
        colors = ('royalblue','lightsteelblue'))

plt.subplot(1,3,3)
plt.pie(pie_status, labels = pie_status_labels, autopct='%1.1f%%', shadow = True,
        colors = ('royalblue','lightsteelblue'))



fig6.savefig('Gráficos circulares.jpg')

Correlation:

In [None]:
# Pairwise correlation matrix of the first 14 columns.
corr = df.iloc[:, :14].corr()
corr

In [None]:
# Hide the upper triangle (including the diagonal) so that each pair of
# variables is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(
    corr,
    mask=mask,
    cmap='mako',
    linewidths=0.5,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
)

--------------------------
# 2 Linear regression (salary)





## 2.1 split dataset

In [None]:
# Work on a NaN-free copy. dropna() returns a new frame, so the shared `df`
# used by the later sections is never modified from here (the previous
# `df_reg = df` followed by dropna(inplace=True) altered `df` itself).
df_reg = df.dropna()

# Besides the target, drop the columns that encode it: salary_ints is a
# binning of salary, and status is 0 for the zero-filled salaries. Keeping
# them would leak the answer into the features.
drop_vars = ['salary', 'salary_ints', 'status']
X_reg = df_reg.drop(drop_vars, axis=1)
y_reg = df_reg.salary

In [None]:
# 75/25 train/test split with a fixed seed, followed by a sanity check that
# the two parts add up to the full data.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.25, random_state=1
)

print(f"X:  {len(X_reg)} = {len(X_train_reg) + len(X_test_reg)}")
print(f"y:  {len(y_reg)} = {len(y_train_reg) + len(y_test_reg)}")

## 2.2 Training model

In [None]:
# Ordinary least-squares regression fitted on the training split.
regr = linear_model.LinearRegression()
regr.fit(X_train_reg,y_train_reg)

## 2.3 Prediction model and evaluation

In [None]:
# Predict on the test split and report the coefficient of determination.
y_pred_reg = regr.predict(X_test_reg)
print('R^2: %.2f' % r2_score(y_test_reg, y_pred_reg)) 

In [None]:
# Show the individual predicted salaries.
list(y_pred_reg)

Some of the predicted salaries are negative, so the model could be improved.

--------------------------
# 3 KNN (status)

## 3.1 Split data

In [None]:
# Features: every column except the target `status`; 75/25 split, fixed seed.
df_knn = df
drop_vars = ['status']
X_knn = df_knn.drop(columns=drop_vars)
y_knn = df_knn['status']


X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(
    X_knn, y_knn, test_size=0.25, random_state=1
)

## 3.2 Train model

In [None]:
# k-nearest-neighbours classifier using the 3 closest training points.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_knn,y_train_knn)

## 3.3 Prediction model and evaluation

In [None]:
# Predict on the held-out split and report the metrics as percentages.
prediction = knn.predict(X_test_knn)

print("Accuracy:",accuracy_score(y_test_knn, prediction)*100)
print("Precision:",precision_score(y_test_knn, prediction)*100)
print("Recall:",recall_score(y_test_knn, prediction)*100)

We have selected too many variables, so we remove some of them.

## 3.4 Refitting model

In [None]:
# Remove the target and both salary columns. salary_ints is a binning of
# salary, so leaving it in would still leak salary information into the
# features; the decision-tree and logistic-regression refits drop the same
# three columns.
drop_vars = ['status', 'salary', 'salary_ints']
X_knn = df_knn.drop(drop_vars, axis=1)
y_knn = df_knn.status


X_train_knn,X_test_knn, y_train_knn, y_test_knn = train_test_split(X_knn, y_knn, test_size = 0.25, random_state = 1)

In [None]:
# Refit on the reduced feature set, this time with 2 neighbours.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_knn,y_train_knn)

In [None]:
# Predict on the held-out split and report the metrics as percentages.
prediction = knn.predict(X_test_knn)

print("Accuracy:",accuracy_score(y_test_knn, prediction)*100)
print("Precision:",precision_score(y_test_knn, prediction)*100)
print("Recall:",recall_score(y_test_knn, prediction)*100)

In [None]:
# Confusion matrix of the KNN predictions. The matrix is transposed before
# plotting so that true labels run along x and predicted labels along y.
predictions_knn = knn.predict(X_test_knn)

mat = confusion_matrix(y_test_knn, predictions_knn)
sns.heatmap(mat.T, annot=True, fmt='d', square=True, cmap='Blues', cbar=False)
plt.gca().set(xlabel='True label', ylabel='predicted label');

In [None]:
# Score of the refitted KNN model on the test split.
knn.score(X_test_knn,y_test_knn)

---------------------
# 4 Decision tree (status)

## 4.1 Split data

In [None]:
# Features: every column except the target `status`; 75/25 split, fixed seed.
df_dt = df
drop_vars = ['status']
X_dt = df_dt.drop(columns=drop_vars)
y_dt = df_dt['status']
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
    X_dt, y_dt, test_size=0.25, random_state=1
)

In [None]:
# Number of observations in the test split.
len(y_test_dt)

## 4.2 Train model

In [None]:
# Fit a decision tree on the training split. The seed is fixed so that the
# tree (and every score derived from it) is identical on each run; without
# it the fitted tree can differ from one run to the next.
tree = DecisionTreeClassifier(random_state=1).fit(X_train_dt, y_train_dt)

## 4.3 Prediction

In [None]:
# Predicted status for the test split.
predictions_tree = tree.predict(X_test_dt)

## 4.4 Evaluation

In [None]:
# Confusion matrix of the tree predictions, transposed so that true labels
# run along x and predicted labels along y.
mat = confusion_matrix(y_test_dt, predictions_tree)
sns.heatmap(mat.T, annot=True, fmt='d', square=True, cmap='Blues', cbar=False)
plt.gca().set(xlabel='True label', ylabel='predicted label');

Again, we must drop some variables

In [None]:
# Rebuild the split without the target and without both salary columns.
drop_vars = ['status', 'salary', 'salary_ints']
X_dt = df_dt.drop(columns=drop_vars)
y_dt = df_dt['status']
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(
    X_dt, y_dt, test_size=0.25, random_state=1
)

In [None]:
# Refit the decision tree on the reduced feature set. The seed is fixed so
# that the tree and its score are identical on each run.
tree = DecisionTreeClassifier(random_state=1).fit(X_train_dt, y_train_dt)

In [None]:
# Predicted status for the test split.
predictions_tree = tree.predict(X_test_dt)

In [None]:
# Score of the refitted tree on the test split.
tree.score(X_test_dt,y_test_dt)

In [None]:
# Confusion matrix of the refitted tree, transposed so that true labels run
# along x and predicted labels along y.
mat = confusion_matrix(y_test_dt, predictions_tree)
sns.heatmap(mat.T, annot=True, fmt='d', square=True, cmap='Blues', cbar=False)
plt.gca().set(xlabel='True label', ylabel='predicted label');

-----------
# 5 K-means

## 5.1 select variables

In [None]:
# Keep only the rows with a non-zero salary. The explicit copy makes df_km an
# independent frame, so adding the 'cluster' column to it later neither
# raises SettingWithCopyWarning nor depends on whether the filter returned a
# view of df.
df_km = df[df["salary"] != 0].copy()

## 5.2 Fitting model

In [None]:
# Two-cluster k-means (fixed seed) on the first eight columns; the result is
# the cluster index assigned to each row.
kmeans = KMeans(n_clusters=2, random_state=0)
clusters_df_km = kmeans.fit_predict(df_km.iloc[:, :8])
clusters_df_km

In [None]:
# Attach the cluster index to each row and preview the result.
df_km['cluster'] = clusters_df_km
df_km.head()

## 5.3 Graph

In [None]:
# ssc_p against degree_p, coloured by the assigned cluster.
plt.scatter(
    x=df_km['ssc_p'],
    y=df_km['degree_p'],
    s=50,
    c=clusters_df_km,
    cmap='viridis',
)

# 6 Logistic regression (status)

## 6.1 Split data

In [None]:
# Features: every column except the target `status`; 75/25 split, fixed seed.
df_logreg = df
drop_vars = ['status']
X_logreg = df_logreg.drop(columns=drop_vars)
y_logreg = df_logreg['status']
X_train_logreg, X_test_logreg, y_train_logreg, y_test_logreg = train_test_split(
    X_logreg, y_logreg, test_size=0.25, random_state=1
)

## 6.2 Train model

In [None]:
# Logistic regression, allowing the solver up to 1000 iterations.
logreg = linear_model.LogisticRegression(max_iter = 1000)
logreg.fit(X_train_logreg,y_train_logreg)

## 6.3 Prediction

In [None]:
# Predicted status for the test split.
logreg_predict = logreg.predict(X_test_logreg)

## 6.4 Evaluation

In [None]:
# Score of the logistic model on the test split.
logreg.score(X_test_logreg,y_test_logreg)

In [None]:
# Confusion matrix of the logistic-regression predictions, transposed so
# that true labels run along x and predicted labels along y.
mat = confusion_matrix(y_test_logreg, logreg_predict)
sns.heatmap(mat.T, annot=True, fmt='d', square=True, cmap='Blues', cbar=False)
plt.gca().set(xlabel='True label', ylabel='predicted label');

Too many variables

## 6.5 Refitting model

In [None]:
# Rebuild the split without the target and without both salary columns.
df_logreg = df
drop_vars = ['status', 'salary', 'salary_ints']
X_logreg = df_logreg.drop(columns=drop_vars)
y_logreg = df_logreg['status']
X_train_logreg, X_test_logreg, y_train_logreg, y_test_logreg = train_test_split(
    X_logreg, y_logreg, test_size=0.25, random_state=1
)

In [None]:
# Refit the logistic regression on the reduced feature set.
logreg = linear_model.LogisticRegression(max_iter=1000)
logreg.fit(X_train_logreg,y_train_logreg)

In [None]:
# Predicted status for the test split.
logreg_predict = logreg.predict(X_test_logreg)

In [None]:
# Score of the refitted logistic model on the test split.
logreg.score(X_test_logreg,y_test_logreg)

In [None]:
# Confusion matrix of the refitted logistic regression, transposed so that
# true labels run along x and predicted labels along y.
mat = confusion_matrix(y_test_logreg, logreg_predict)
sns.heatmap(mat.T, annot=True, fmt='d', square=True, cmap='Blues', cbar=False)
plt.gca().set(xlabel='True label', ylabel='predicted label');

# 7 Random forest (salary)

## 7.1 Split data

In [None]:
# Target: the salary bin. Features exclude the bin itself, the raw salary
# and the placement status. Half of the data is held out for testing.
df_rf = df
drop_vars = ['salary_ints', 'salary', 'status']
X_rf = df_rf.drop(columns=drop_vars)
y_rf = df_rf['salary_ints']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X_rf, y_rf, test_size=0.5, random_state=2
)

## 7.2 Fitting model

In [None]:
# Random forest of 100 trees with a fixed seed, trained on two parallel jobs.
rf=RandomForestClassifier(n_estimators=100, random_state=40, n_jobs=2)
rf.fit(X_train_rf,y_train_rf)

## 7.3 Prediction and evaluation

In [None]:
# Predict the salary bin for the test split and report the accuracy.
y_pred_rf = rf.predict(X_test_rf)
print("Accuracy:",accuracy_score(y_test_rf, y_pred_rf))

In [None]:
# Confusion matrix of the random-forest predictions, transposed so that true
# labels run along x and predicted labels along y.
rf_predict = rf.predict(X_test_rf)

mat = confusion_matrix(y_test_rf, rf_predict)
sns.heatmap(mat.T, annot=True, fmt='d', square=True, cmap='Blues', cbar=False)
plt.gca().set(xlabel='True label', ylabel='predicted label');

In [None]:
# Compare the set of predicted salary bins with the bins present in the data.
print(np.unique(rf_predict),np.unique(df_rf.salary_ints))

We could create dummy variables to improve the models.