# importing standard libraries

In [130]:
import pandas as pd 
import sklearn
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn import tree
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2

from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest

from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas as pd
import  xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from sklearn.model_selection import train_test_split

In [131]:
# Changing the path of data
import os 
os.chdir('C:/Uconn MSBA/studies/Kaggle/data and code/TMP_code')

# Creating usable Functions


In [132]:
def loadData(filePath='job-classifications.txt'):
    """
    Function will load the data from the given path into a dataframe
    
    Arguments:
    filePath -- path of the file 
    
    Return:
    data --  dataframe 
            
    """
    data = pd.read_table(filePath, index_col=None, engine='python')
    return data

In [133]:
def preProcessing(features):
       
    """
    Function will preprocess the titles 
    
    Arguments:
    features -- Series of string 
    
    Return:
    clean_titles -- Series of string titles
    clean_wordlist -- Series of list of titles
        
    """
    num_titles = features.size
    clean_wordlist = []
    clean_titles = []
    
    # Extracting unique stopwords from English language 
    stops = set(stopwords.words('english'))
    
    for i in range( 0, num_titles):
        #letters_only = re.sub("[^a-zA-Z]", " ", features[i]) 
        
        words = str(features[i]).lower().split()
        
        # Converting words into lower case for symmetry 
        words = [w.lower() for w in words if not w in stops]  
        
        # Appending preprocessed words in a list 
        clean_wordlist.append(words)
        
        # Converting list to string 
        clean_titles.append(" ".join(words))
    return clean_titles

In [134]:
def data_modification(data):
    
    """
    Function will do basicn EDA on the data 
    
    Arguments:
    data -- dataframe
    
    Return:
    data -- Preprocessed dataframe
            
    """
  
    # rename a column to remove space
    data = data.rename(columns={'Job Title': 'Job_Title'})
    
    # drop missing values as they are very few
    data = data.dropna(how='any',axis=0)
    
    # Reset index
    data = data.reset_index(drop=True)
    return data 

In [135]:
def split_data(data):
    
    """
    Function will Split the data in train and validation set
    
    Arguments:
    data -- dataframe

    Return:
    train_x,valid_x,train_y,valid_y -- traning and validation set
            
    """
        
    y = data['Category']
    X = data[['Job_Title','Class']]
    
    # stratified sampling as this is imbalanced dataset
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.5,stratify=y)
    return train_x,valid_x,train_y,valid_y

In [136]:
# create a count vectorizer object 
def create_countvector(train_x,valid_x):
    
    """
    Function will calculate frequency of every term(column) in every document(row)
    
    Arguments:
    train_x -- training dataset
    valid_x  - validation dataset 

    Return:
    xtrain_tfidf  - TF-IDF score on  training dataset
    xvalid_tfidf   - TF-IDF score on  validation dataset
            
    """     
    
    # create a count vectorizer object 
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    count_vect.fit(data['Job_Title'])
    
    # transform the training and validation data using count vectorizer object
    xtrain_count =  count_vect.transform(train_x['Job_Title'])
    xvalid_count =  count_vect.transform(valid_x['Job_Title'])
    return xtrain_count,xvalid_count

In [137]:
# word level tf-idf
def word_leve_tf_idf(train_x,valid_x):
    
    """
    Function will calculate TF-IDF scores of every term in different documnets 
    
    Arguments:
    train_x -- training dataset
    valid_x  - validation dataset
   
    Return:
    xtrain_tfidf  - TF-IDF score on  training dataset
    xvalid_tfidf   - TF-IDF score on  validation dataset
            
    """
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    tfidf_vect.fit(data['Job_Title'])
    xtrain_tfidf =  tfidf_vect.transform(train_x['Job_Title'])
    xvalid_tfidf =  tfidf_vect.transform(valid_x['Job_Title'])
    return xtrain_tfidf,xvalid_tfidf

In [138]:
# ngram level tf-idf 
def ngrm_level_tf_idf(train_x,valid_x):
    
    """
    Function will calculate TF-IDF scores of N_gram which are combination of n terms together 
    
    Arguments:
    train_x -- training dataset
    valid_x  - validation dataset

    Return:
    xtrain_tfidf_ngram  - TF-IDF score on  training dataset
    xvalid_tfidf_ngram   - TF-IDF score on  validation dataset
            
    """
    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    tfidf_vect_ngram.fit(data['Job_Title'])
    xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x['Job_Title'])
    xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x['Job_Title'])
    return xtrain_tfidf_ngram,xvalid_tfidf_ngram


In [139]:
# characters level tf-idf
def char_level_tf_idf(train_x,valid_x):
    
    """
    Function will calculate TF-IDF scores of character level n_gram 
    
    Arguments:
    train_x -- training dataset
    valid_x  - validation dataset
   
    Return:
    xtrain_tfidf  - TF-IDF score on  training dataset
    xvalid_tfidf   - TF-IDF score on  validation dataset
            
    """
        
    tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
    tfidf_vect_ngram_chars.fit(data['Job_Title'])
    xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x['Job_Title']) 
    xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x['Job_Title']) 
    return xtrain_tfidf_ngram_chars,xvalid_tfidf_ngram_chars

In [140]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    """
    Function will fit the training dataset on the classifier
    
    Arguments:
    classifier -- type of model to be used
    feature_vector_train  -- training data 
    label  -- dependent variable of training 
    feature_vector_valid  -- validation data set
    
    Return:
    np.mean(precision) --  mean of precision of all 3 classes ( Precision = TP/Predicted Positive )
    np.mean(recall)  --  mean of recall of all 3 classes   ( Recall = TP/Actual Positive )
    np.mean(fscore)  --  mean of fscore of all 3 classes
    accuracy   --  mean of accuracy of all 3 classes
            
    """
    
    # creating empty list for storing values
    precision = []
    recall = []
    fscore = []
    accuracy = 0
    
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
            
    p,r,f,s = precision_recall_fscore_support(predictions, valid_y, average='weighted')
    accuracy  = metrics.accuracy_score(predictions, valid_y)
    precision.append(p)
    recall.append(r)
    fscore.append(f)
    return np.mean(precision),np.mean(recall),np.mean(fscore),np.mean(accuracy)

# Calling Functions

In [141]:
# Load the dataset

data  = loadData()

In [142]:
# modifying data for removing null value 

data = data_modification(data)

In [143]:
# Loading the respective column in separate series for ease of use 

titles = data['Job_Title']
labels = data['Category']
dclass = data['Class']

In [144]:
# Preprocessig the titles

data['Job_Title'] = preProcessing(titles)

In [145]:
# Since this is imbalanced class classification problem so we will take only those categories which has atleast 2 values 

data['cnt'] = data.groupby(['Category'])['Job_Title'].transform('count')
data = data.drop(data[data.cnt==1].index)

In [146]:
# Splitting the data 

train_x, valid_x, train_y, valid_y=       split_data(data)

In [147]:
# Creating count vector of terms in Job_Title

xtrain_count,xvalid_count = create_countvector(train_x,valid_x)

In [148]:
# Creating word level Tf-tdf matrix

xtrain_tfidf,xvalid_tfidf =      word_leve_tf_idf(train_x,valid_x)

In [149]:
# Creating n_gram level Tf-tdf matrix

xtrain_tfidf_ngram,xvalid_tfidf_ngram = ngrm_level_tf_idf(train_x,valid_x)

In [150]:
# Creating char level level Tf-tdf matrix

xtrain_tfidf_ngram_chars,xvalid_tfidf_ngram_chars =char_level_tf_idf(train_x,valid_x)

# Naive Bayes

In [168]:
# Creating a dataframe for appaneding different type of validation parameter 

metrics_NB = pd.DataFrame({
                    'Model_Name':['Naive Bayes','Naive Bayes','Naive Bayes','Naive Bayes'],
                    'Type_vector':['Count','Word','N-Gram','Char'],
                    'Precision':[None,None,None,None],
                    'Recall':[None,None,None,None],
                    'F-Score':[None,None,None,None],
                    'Accuracy':[None,None,None,None]
                   
                   })

In [170]:
# Naive BAyes Algoorithm :Based in Bayes theorem 
# Assume independence of predictors 

# Naive Bayes on Count Vectors
precision, recall, fscore, accuracy  = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy,  )
metrics_NB.ix[0,'Precision']= precision 
metrics_NB.ix[0,'Recall']= recall 
metrics_NB.ix[0,'F-Score']= fscore 
metrics_NB.ix[0,'Accuracy']= accuracy 



# Naive Bayes on Word Level TF IDF Vectors
precision, recall, fscore, accuracy  = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF:: precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy,  )
metrics_NB.ix[1,'Precision']= precision 
metrics_NB.ix[1,'Recall']= recall 
metrics_NB.ix[1,'F-Score']= fscore 
metrics_NB.ix[1,'Accuracy']= accuracy 

# Naive Bayes on Ngram Level TF IDF Vectors
precision, recall, fscore, accuracy  = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors:  precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy)
metrics_NB.ix[2,'Precision']= precision 
metrics_NB.ix[2,'Recall']= recall 
metrics_NB.ix[2,'F-Score']= fscore 
metrics_NB.ix[2,'Accuracy']= accuracy 


# Naive Bayes on Character Level TF IDF Vectors
precision, recall, fscore, accuracy  = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors:  precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy)
metrics_NB.ix[3,'Precision']= precision 
metrics_NB.ix[3,'Recall']= recall 
metrics_NB.ix[3,'F-Score']= fscore 
metrics_NB.ix[3,'Accuracy']= accuracy 


  'recall', 'true', average, warn_for)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


NB, Count Vectors: precision  0.8104312268659407 recall 0.51935352437865 fscore 0.6003121566352744 accuracy 0.51935352437865
NB, WordLevel TF-IDF:: precision  0.8403168512339607 recall 0.49307347548553576 fscore 0.5830813230599864 accuracy 0.49307347548553576
NB, N-Gram Vectors:  precision  0.8092676368338143 recall 0.5290642401195165 fscore 0.5874727454642509 accuracy 0.5290642401195165
NB, CharLevel Vectors:  precision  0.816801986122359 recall 0.5103897867716963 fscore 0.5979850480506372 accuracy 0.5103897867716963


In [171]:
metrics_NB.head()

Unnamed: 0,Accuracy,F-Score,Model_Name,Precision,Recall,Type_vector
0,0.519354,0.600312,Naive Bayes,0.810431,0.519354,Count
1,0.493073,0.583081,Naive Bayes,0.840317,0.493073,Word
2,0.529064,0.587473,Naive Bayes,0.809268,0.529064,N-Gram
3,0.51039,0.597985,Naive Bayes,0.816802,0.51039,Char


# Linear Model 

In [172]:
# Creating a dataframe for appaneding different type of validation parameter 

metrics_LR = pd.DataFrame({
                    'Model_Name':['Linear Model','Linear Model','Linear Model','Linear Model'],
                    'Type_vector':['Count','Word','N-Gram','Char'],
                    'Precision':[None,None,None,None],
                    'Recall':[None,None,None,None],
                    'F-Score':[None,None,None,None],
                    'Accuracy':[None,None,None,None]
                   
                   })

In [174]:
# Using Linear CLassifier (Logistic Regression)
# It estimates probabilities using a logistic/sigmoid function

# # Linear Classifier on Count Vectors
precision, recall, fscore, accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy)

metrics_LR.ix[0,'Precision']= precision 
metrics_LR.ix[0,'Recall']= recall 
metrics_LR.ix[0,'F-Score']= fscore 
metrics_LR.ix[0,'Accuracy']= accuracy 

# Linear Classifier on Word Level TF IDF Vectors
precision, recall, fscore, accurac = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF:: precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy)
metrics_LR.ix[1,'Precision']= precision 
metrics_LR.ix[1,'Recall']= recall 
metrics_LR.ix[1,'F-Score']= fscore 
metrics_LR.ix[1,'Accuracy']= accuracy 


# Linear Classifier on Ngram Level TF IDF Vectors
precision, recall, fscore, accurac = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors:  precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy)
metrics_LR.ix[2,'Precision']= precision 
metrics_LR.ix[2,'Recall']= recall 
metrics_LR.ix[2,'F-Score']= fscore 
metrics_LR.ix[2,'Accuracy']= accuracy 

# Linear Classifier on Character Level TF IDF Vectors
precision, recall, fscore, accurac = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors:  precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy)
metrics_LR.ix[3,'Precision']= precision 
metrics_LR.ix[3,'Recall']= recall 
metrics_LR.ix[3,'F-Score']= fscore 
metrics_LR.ix[3,'Accuracy']= accuracy 

  'recall', 'true', average, warn_for)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


LR, Count Vectors: precision  0.7922052298880102 recall 0.6595137851419258 fscore 0.7014662094373432 accuracy 0.6595137851419258
LR, WordLevel TF-IDF:: precision  0.7986293640827916 recall 0.6308569876409073 fscore 0.684223382888476 accuracy 0.6595137851419258
LR, N-Gram Vectors:  precision  0.7774708626726773 recall 0.6117071845715062 fscore 0.6463358147894406 accuracy 0.6595137851419258
LR, CharLevel Vectors:  precision  0.7985138754451147 recall 0.639413282629363 fscore 0.6945817638390521 accuracy 0.6595137851419258


In [175]:
metrics_LR.head()

Unnamed: 0,Accuracy,F-Score,Model_Name,Precision,Recall,Type_vector
0,0.659514,0.701466,Linear Model,0.792205,0.659514,Count
1,0.659514,0.684223,Linear Model,0.798629,0.630857,Word
2,0.659514,0.646336,Linear Model,0.777471,0.611707,N-Gram
3,0.659514,0.694582,Linear Model,0.798514,0.639413,Char


# SVM model

In [176]:
# Creating a dataframe for appaneding different type of validation parameter 

metrics_SVM = pd.DataFrame({
                    'Model_Name':['SVM'],
                    'Type_vector':['Word'],
                    'Precision':[None],
                    'Recall':[None],
                    'F-Score':[None],
                    'Accuracy':[None]
                   
                   })

In [177]:
# SVM extractes best possible hyper plane that segregaets 2 classes

accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

metrics_SVM.ix[0,'Precision']= precision 
metrics_SVM.ix[0,'Recall']= recall 
metrics_SVM.ix[0,'F-Score']= fscore 
metrics_SVM.ix[0,'Accuracy']= accuracy 


SVM, N-Gram Vectors:  (1.0, 0.051948933858481594, 0.09876702601510555, 0.051948933858481594)


  'recall', 'true', average, warn_for)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


In [178]:
metrics_SVM.head()

Unnamed: 0,Accuracy,F-Score,Model_Name,Precision,Recall,Type_vector
0,"(1.0, 0.051948933858481594, 0.0987670260151055...",0.694582,SVM,0.798514,0.639413,Word


# Bagging mode

In [179]:
# Creating a dataframe for appaneding different type of validation parameter 

metrics_RF = pd.DataFrame({
                    'Model_Name':['Random Forest','Random Forest'],
                    'Type_vector':['Count','Word'],
                    'Precision':[None,None],
                    'Recall':[None,None],
                    'F-Score':[None,None],
                    'Accuracy':[None,None]
                   
                   })

In [180]:
# Random Forest Algorithm: Create multiple trees, subsample data and get scores
# Using Random forest with all default values 

accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy)

metrics_RF.ix[0,'Precision']= precision 
metrics_RF.ix[0,'Recall']= recall 
metrics_RF.ix[0,'F-Score']= fscore 
metrics_RF.ix[0,'Accuracy']= accuracy 

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF:: precision ", precision, "recall",recall,"fscore",fscore,"accuracy",accuracy)

metrics_RF.ix[1,'Precision']= precision 
metrics_RF.ix[1,'Recall']= recall 
metrics_RF.ix[1,'F-Score']= fscore 
metrics_RF.ix[1,'Accuracy']= accuracy 



  'recall', 'true', average, warn_for)
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


RF, Count Vectors: precision  0.7985138754451147 recall 0.639413282629363 fscore 0.6945817638390521 accuracy (0.7786192863234669, 0.6862012766535379, 0.7096240415963905, 0.6862012766535379)
RF, WordLevel TF-IDF:: precision  0.7985138754451147 recall 0.639413282629363 fscore 0.6945817638390521 accuracy (0.7746231960497377, 0.6896645389107701, 0.7138196738389014, 0.6896645389107701)


  'recall', 'true', average, warn_for)


In [181]:
metrics_RF.head()

Unnamed: 0,Accuracy,F-Score,Model_Name,Precision,Recall,Type_vector
0,"(0.7786192863234669, 0.6862012766535379, 0.709...",0.694582,Random Forest,0.798514,0.639413,Count
1,"(0.7746231960497377, 0.6896645389107701, 0.713...",0.694582,Random Forest,0.798514,0.639413,Word


In [182]:
# ALthough we have calculated different metics , a good model should be the one which has good precisiona nd good recall. 
# This can be effectively measured by F-Score

In [183]:
ValidationScore = metrics_NB.append(metrics_LR,ignore_index = True)
ValidationScore = crossValidationScore.append(metrics_SVM,ignore_index = True)
ValidationScore = crossValidationScore.append(metrics_RF,ignore_index = True)

In [184]:
ValidationScore

Unnamed: 0,Accuracy,F-Score,Model_Name,Precision,Recall,Type_vector
0,0.519354,0.600312,Naive Bayes,0.810431,0.519354,Count
1,0.493073,0.583081,Naive Bayes,0.840317,0.493073,Word
2,0.529064,0.587473,Naive Bayes,0.809268,0.529064,N-Gram
3,0.51039,0.597985,Naive Bayes,0.816802,0.51039,Char
4,0.659514,0.701466,Linear Model,0.792205,0.659514,Count
5,0.659514,0.684223,Linear Model,0.798629,0.630857,Word
6,0.659514,0.646336,Linear Model,0.777471,0.611707,N-Gram
7,0.659514,0.694582,Linear Model,0.798514,0.639413,Char
8,"(1.0, 0.051948933858481594, 0.0987670260151055...",0.694582,SVM,0.798514,0.639413,Word
9,"(0.7767306904604047, 0.6864729050658699, 0.711...",0.694582,Random Forest,0.798514,0.639413,Count


# Selecting Best model

In [None]:
# For a good classification model,it is important to have good accuracy as well as good F Score 
# SO based on above scores , we can choose Random FOrest for this model 