In [None]:
## On ‘Wuzzuf_Job_Postings_Sample.csv’, the following has been done:
## I used OpenRefine to:
## Remove all the rows with missing values in job_description, job_requirements, 
##         payment_period, currency (21850 rows to 19112 rows)
## Cluster all the similar cities in city, but keep the values that look like dates as is - 
##        “10th of Ramadan”, “6th October”, etc. - because they do not indicate which 
##        city they refer to
## the input data here is the cleaned dataset
## ----------------------------------------------------------------------------------------------
## I have also made a few pre-modeling visualizations with a couple of good insights, which you can
## check out at: 
## https://public.tableau.com/profile/pooja.vijaykumar#!/vizhome/WuzzufJobPostingsDataset/Story1
## ----------------------------------------------------------------------------------------------
## A Classifier can be created that classifies each application based on job_category1 so that 
## when a person enters their qualifications, max/min salary expectations, their skills, etc., 
## their application falls into the respective category such as Engineering, IT, Marketing, 
## HR, Banking, Retail, Fashion, Journalism etc., 
## Features: salary min/max, payment_period, currency, num_vacancies, career_level
## Target: job_category1


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [None]:
print(os.listdir('../input'))

In [None]:
data1 = pd.read_csv('../input/cleaned-wuzzuf-job-posts-sample/Cleaned_Wuzzuf_Job_Posts_Sample-csv.csv')

In [None]:
data2 = pd.read_csv('../input/wuzzuf-job-posts/Wuzzuf_Applications_Sample.csv')

In [None]:
data1.head()

In [None]:
data1.info()

In [None]:
## check for missing values
data1.isnull().sum()

In [None]:
## remove record with the single missing value
data1 = data1.dropna(axis=0)

In [None]:
data2.head()

In [None]:
data2.info()

### Saving data into another variable so as not to modify the original dataset: (Rather than dropping features from the original dataset, it's sometimes better to take all the necessary features and save them into another variable)

In [None]:
data11 = data1[['job_category1','job_industry1','salary_minimum','salary_maximum','num_vacancies','career_level','payment_period','currency']]

In [None]:
data11.head(3)

### One-hot encoding of all the object type columns:

In [None]:
data11 = data11.join(pd.get_dummies(data11['career_level'],prefix='CareerLevel'))

In [None]:
data11 = data11.join(pd.get_dummies(data11['payment_period'],prefix='paymentperiod'))

In [None]:
data11 = data11.join(pd.get_dummies(data11['currency'],prefix='currency'))

### Removing job_industry1 and the original (now one-hot encoded) categorical columns to avoid Multicollinearity:

In [None]:
data11 = data11.drop(['job_industry1','career_level','payment_period','currency'],axis=1)

In [None]:
data11.head()

In [None]:
x = data11.drop('job_category1',axis=1)

In [None]:
y = data11['job_category1']

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=123)

### Decision Tree Classifier:

In [None]:
DTmodel = DecisionTreeClassifier(max_depth=4).fit(xtrain,ytrain)

In [None]:
DTpred = DTmodel.predict(xtest)

In [None]:
DTpred[:5,]

In [None]:
ytest[:5,]

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(ytest,DTpred)

In [None]:
## Run this if you want to observe the Decision Tree diagram
## useful to observe the feature split, gini index, etc

#from sklearn.tree import export_graphviz
#import graphviz

#dot_data = export_graphviz(DTmodel, filled=True, rounded=True, feature_names=xtrain.columns, out_file=None)
#graphviz.Source(dot_data)

### Decision Tree Classifier with GridSearchCV:

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param = [{"max_depth":[5,6,7,8,9,10,11,12,13,14, None], "max_features":[7,10,11,12,13,14,15,16]}]

In [None]:
gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=321),param_grid=param,cv=2)

In [None]:
gs.fit(xtrain,ytrain)

In [None]:
gs.best_params_

In [None]:
pred = gs.predict(xtest)

In [None]:
accuracy_score(ytest,pred)

### Accuracy score has increased from 0.34 to 0.36 using GridSearchCV

### Random Forest Classifier: (because it's always better to compare algorithms)

In [None]:
from sklearn.ensemble import RandomForestClassifier

RFmodel = RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [None]:
RFmodel.fit(xtrain, ytrain)

In [None]:
RFpred=RFmodel.predict(xtest)

In [None]:
accuracy_score(ytest,RFpred)

### Random Forest shows a higher accuracy score than the Decision Tree

### I've tried what I could with what I've learnt so far in this field. Please do let me know if there's anything else that can be done. The fun is that there are thousands of opportunities that come with a dataset like this! Different models can be built for different intentions so go crazy!
    