# Task for Today  

***

## Interview Success Prediction  

Given *data about resumes*, let's try to predict whether a candidate will **pass their interview** based on their resume.

We will train three models (logistic regression, a decision tree, and a random forest) to make our predictions, first on the full feature set and then on features reduced with PCA.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed. The fully-qualified key is
# used because the bare 'max_columns' pattern matches more than one option in
# newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the resume dataset from its CSV file into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle dataset
# mount; adjust it when running elsewhere.
data = pd.read_csv('../input/strategeion-resume-skills/resumes_development.csv')

In [None]:
# Display the loaded DataFrame for a first look at its rows and columns.
data

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df, *, train_size=0.7, random_state=1):
    """Split the resume data into scaled train/test features and targets.

    Drops the 'Unnamed: 0' index column, separates the 'Interview' target
    from the remaining feature columns, performs a shuffled train/test
    split, and standardizes the features with a StandardScaler that is
    fitted on the training split only.

    Args:
        df: DataFrame containing the 'Unnamed: 0' and 'Interview' columns
            plus the feature columns. It is not modified.
            NOTE(review): the features are assumed to be numeric, since
            they are passed straight to StandardScaler -- confirm.
        train_size: Forwarded to ``train_test_split`` (default 0.7).
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible (default 1).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X values are
        DataFrames that keep the original row index and column names;
        the y values are the matching slices of the 'Interview' column.

    Raises:
        KeyError: If 'Unnamed: 0' or 'Interview' is missing from ``df``.
    """
    # drop() returns a new frame, so the caller's DataFrame stays untouched
    # without needing an explicit copy.
    y = df['Interview']
    X = df.drop(['Unnamed: 0', 'Interview'], axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        shuffle=True,
        random_state=random_state,
    )

    # Fit the scaler on the training split only so that no statistics from
    # the test split influence the transformation.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test feature sets and their targets from the raw data.
X_train, X_test, y_train, y_test  = preprocess_inputs(data)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Count how many training rows fall into each target value.
y_train.value_counts()

# Training/Results

In [None]:
# Candidate classifiers, keyed by display name. The names are left-padded to
# a common width so the printed lines align.
# The tree-based models use randomness internally, so they get a fixed seed
# to keep the reported results reproducible across runs (the train/test split
# is seeded as well).
models = {
    "Logistic Regression": LogisticRegression(),
    "      Decision Tree": DecisionTreeClassifier(random_state=1),
    "      Random Forest": RandomForestClassifier(random_state=1),
}

# Fit every model on the full (scaled) training features.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

In [None]:
# Report each fitted model's score on the held-out test set as a percentage.
for name, model in models.items():
    print(f"{name} Accuracy: {model.score(X_test, y_test) * 100:.2f}%")

# Training/Results With Dimensionality Reduction

In [None]:
# Number of principal components to keep.
n_components = 5

# Fit the projection on the training features only; the same fitted
# transformation is then applied to both splits.
pca = PCA(n_components=n_components).fit(X_train)

# Column labels PC1..PCn shared by both reduced frames.
pc_columns = [f"PC{i}" for i in range(1, n_components + 1)]

X_train_reduced = pd.DataFrame(
    pca.transform(X_train),
    index=X_train.index,
    columns=pc_columns,
)
X_test_reduced = pd.DataFrame(
    pca.transform(X_test),
    index=X_test.index,
    columns=pc_columns,
)

In [None]:
# Display the training features after projection onto the principal components.
X_train_reduced

In [None]:
# Fresh, unfitted copies of the same classifiers for the PCA-reduced features.
# The names are left-padded to a common width so the printed lines align.
# The tree-based models use randomness internally, so they get a fixed seed
# to keep the reported results reproducible across runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "      Decision Tree": DecisionTreeClassifier(random_state=1),
    "      Random Forest": RandomForestClassifier(random_state=1),
}

# Fit every model on the reduced training features.
for name, model in models.items():
    model.fit(X_train_reduced, y_train)
    print(name + " trained.")

In [None]:
# Report each model's score on the PCA-reduced test set as a percentage.
for name, model in models.items():
    print(f"{name} Accuracy: {model.score(X_test_reduced, y_test) * 100:.2f}%")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/BhlR-kHxc3E