<a href="https://colab.research.google.com/github/rajeshkpandey/AWESOME-FER/blob/master/Job_Title_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Print the entries of the current working directory
# (presumably to check that train.csv / test.csv are present - confirm).
print(os.listdir("./"))

import pandas as pd
import numpy as np
from scipy.stats import randint
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
# Load the training and test data from the working directory.
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Only the final expression of a cell is rendered automatically, so a bare
# `df.head()` followed by another statement is silently discarded.
# Display both previews explicitly.
display(df.head())
display(test_df.head())

In [None]:
# Report (rows, columns) for both datasets. Printing explicitly, because a
# bare expression that is not the last statement of a cell is never shown.
print(df.shape)
print(test_df.shape)

In [None]:
# Show the last rows of both datasets. `display` is used for each one,
# because a bare expression that is not the last statement of a cell is
# never rendered.
display(df.tail())
display(test_df.tail())

In [None]:
nltk.download('wordnet')
nltk.download('stopwords')

# Build the NLP helpers once instead of once per row.
ps = PorterStemmer()
wml = WordNetLemmatizer()
Stopwords = set(stopwords.words('english'))
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_description(text):
    """Normalise a single job description.

    Steps: lower-case, remove ASCII punctuation, split on whitespace, drop
    English stop words, then stem and lemmatise each remaining token and
    join the tokens back with single spaces.

    Non-string values (e.g. NaN for a missing description) are returned
    unchanged so that a later null filter can still detect them.
    """
    if not isinstance(text, str):
        return text
    tokens = text.lower().translate(_PUNCTUATION_TABLE).split()
    # Stop words are filtered BEFORE stemming: stemming first would turn
    # e.g. "this" into "thi", which no longer matches the stop-word list.
    # The stemmer and lemmatiser both operate on single words, so they are
    # applied per token rather than to the whole document string.
    return " ".join(
        wml.lemmatize(ps.stem(token))
        for token in tokens
        if token not in Stopwords
    )


# cleaning training dataset
# Whole-column assignment avoids chained indexing (`df[col][i] = ...`), which
# writes to a temporary under pandas copy-on-write and assumes a 0..n-1 index.
df['Job Description'] = df['Job Description'].map(clean_description)

# cleaning testing set
test_df['Job Description'] = test_df['Job Description'].map(clean_description)
 

In [None]:
# Build a two-column frame (text + label), discard rows whose description is
# missing (NaN), and give the text column a simpler, identifier-friendly name.
df1 = (
    df.loc[:, ['Job Description', 'Title']]
    .dropna(subset=['Job Description'])
    .rename(columns={'Job Description': 'Job_Description'})
)

df1.shape

In [None]:
# Preview the last rows of the filtered, renamed dataframe.
df1.tail()

In [None]:
# Share of the original rows (df) that carry a non-null job description,
# as a percentage rounded to one decimal.
total = df1['Job_Description'].notna().sum()
pct_with_text = total / len(df) * 100
round(pct_with_text, 1)

In [None]:
# Distinct job titles in the full training data, as a one-column 2-D array.
pd.DataFrame(df['Title'].unique()).to_numpy()

In [None]:
# Number of distinct job titles (a missing title, if any, counts as one value).
df['Title'].nunique(dropna=False)

In [None]:
# Draw a reproducible random subset of at most 20000 rows. The size is capped
# at len(df1) because DataFrame.sample raises ValueError when asked for more
# rows than exist (without replacement).
sample_size = min(20000, len(df1))
df2 = df1.sample(sample_size, random_state=1).copy()

In [None]:
# Distinct job titles present in the sampled subset.
pd.DataFrame(df2['Title'].unique())

In [None]:
# Encode each distinct title as an integer id in a new 'category_id' column.
title_codes, _ = pd.factorize(df2['Title'])
df2['category_id'] = title_codes

# One row per distinct (Title, category_id) pair.
category_id_df = df2.loc[:, ['Title', 'category_id']].drop_duplicates()

# Lookup dictionaries in both directions, for future use.
_titles = category_id_df['Title'].tolist()
_ids = category_id_df['category_id'].tolist()
category_to_id = dict(zip(_titles, _ids))
id_to_category = dict(zip(_ids, _titles))

# Preview the dataframe with the new column.
df2.head()

In [None]:
# Preview the last rows of the sampled dataframe (now including category_id).
df2.tail()

In [None]:
# TF-IDF over unigrams and bigrams, with sub-linear term-frequency scaling,
# ignoring English stop words and terms seen in fewer than 5 documents.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    stop_words='english',
)

# Turn every job description into a dense feature vector.
features = tfidf.fit_transform(df2['Job_Description']).toarray()

# Integer-encoded titles are the classification targets.
labels = df2['category_id']

n_descriptions, n_features = features.shape
print("Each of the %d job descriptions is represented by %d features (TF-IDF score of unigrams and bigrams)" % (n_descriptions, n_features))

In [None]:
# Inputs are the job descriptions; targets are the titles to predict.
X, y = df2['Job_Description'], df2['Title']

# Hold out 25% of the sample for testing, with a fixed seed.
split_options = {'test_size': 0.25, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Candidate classifiers to compare on the TF-IDF features.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5-fold cross-validation
CV = 5

# Collect one (model name, fold index, accuracy) record per model and fold.
# The results frame is built once from these records; the empty placeholder
# frame that used to be created up front was overwritten and never used.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )

cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
# Per-model mean and standard deviation of the cross-validation accuracies.
accuracy_by_model = cv_df.groupby('model_name')['accuracy']
mean_accuracy = accuracy_by_model.mean()
std_accuracy = accuracy_by_model.std()

# Side-by-side summary table, indexed by model name.
acc = pd.DataFrame({
    'Mean Accuracy': mean_accuracy,
    'Standard deviation': std_accuracy,
})
acc

In [None]:
# Split the TF-IDF features, the encoded labels and the matching row indices
# of df2 into train/test parts (25% test, fixed seed).
split_parts = train_test_split(
    features,
    labels,
    df2.index,
    test_size=0.25,
    random_state=1,
)
X_train, X_test, y_train, y_test, indices_train, indices_test = split_parts

# Fit a linear SVM on the training part and predict the held-out part.
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Classification report
# Recompute the title list with the same factorize call that produced
# 'category_id', so that class_names[i] is the title encoded as id i.
# (The previous code read a 'Product' column that does not exist in df2.)
_, class_names = pd.factorize(df2['Title'])
class_names = list(class_names)

print('\t\t\t\tCLASSIFICATION METRICS\n')
# Pass `labels` explicitly: without it the report only covers the ids found
# in y_test / y_pred, and the call fails or mislabels rows when a class is
# missing from the split and the number of names no longer matches.
print(metrics.classification_report(y_test, y_pred,
                                    labels=np.arange(len(class_names)),
                                    target_names=class_names))

In [None]:
# Re-split the raw text (not the precomputed features): 25% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

# Fresh vectoriser with the same settings as before, fitted on the training
# text only.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    stop_words='english',
)
fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

# Linear SVM trained on the vectorised training text; targets are the titles.
model = LinearSVC()
model.fit(tfidf_vectorizer_vectors, y_train)

In [None]:
# Vectorise and classify all test descriptions in one batch, instead of one
# transform + predict call per row. Iterating the column directly also drops
# the label-based `[i]` lookup, which only worked with a default 0..n-1 index.
test_descriptions = test_df['Job Description']
if len(test_descriptions):
    batch_predictions = model.predict(fitted_vectorizer.transform(test_descriptions))
    # Keep the shape the row-by-row version produced: a list holding one
    # single-element array per test row.
    predictions = list(batch_predictions.reshape(-1, 1))
else:
    # predict() rejects an input with zero samples; an empty test set simply
    # yields no predictions.
    predictions = []

In [None]:
# Print the predictions collected for the rows of test_df.
print(predictions)