# This notebook applies LinearSVC, part of scikit-learn's SVM package, to the CSV file that we built from the resume text

## Background Details
### Working with resume data stored in .csv file job_desc
<ul>
    <li>reading the given data from csv file</li>
    <li>lemmatization and transformation of data</li>
    <li>splitting and vectorizing data</li>
</ul>

### Classification : LinearSVC
<ul>
    <li>Building a LinearSVC model</li>
    <li>Comparing LinearSVC with logistic regression on different metrics</li>
</ul>



In [None]:
#Importing important packages
import os
import sys

#Data Wrangling and manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#Importing from the Natural Language Toolkit (NLTK) and the scikit-learn library
import re   #Importing the regular expression from the regex package
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [None]:
#Loading the data from the csv file
#The location can be overridden through the JOB_DESC_CSV environment variable;
#the original path is kept as the default so existing setups keep working.
job_desc_path = os.environ.get('JOB_DESC_CSV', 'C:/Users/User/Resume_Data_Science/job_desc.csv')
job_desc = pd.read_csv(job_desc_path)

#Checking out a sample format of the loaded data
job_desc.sample(10)


### Lemmatization and Transformation of the given data 

In [None]:
#Defining the lemmatizer and the English stop-word set used for cleaning
lemmatizer = WordNetLemmatizer()
stp_words = set(stopwords.words('english'))


def _clean_description(text):
    """Lower-case a description, drop English stop words and lemmatize the rest.

    The text is lower-cased *before* the stop-word check because the stop-word
    set is matched by exact string comparison, so capitalised tokens such as
    'The' would otherwise slip through.
    """
    tokens = str(text).lower().split()
    return " ".join(lemmatizer.lemmatize(tok) for tok in tokens if tok not in stp_words)


#Lemmatizing and removing the stopwords
#Creating a separate column (missing descriptions are treated as empty text)
job_desc['clean'] = job_desc['description'].fillna('').apply(_clean_description)


#Removing the confusing titles i.e. those that mention both 'Data Scientist' and 'Data Analyst'
titles = job_desc['title'].fillna('').astype(str)
mentions_scientist = titles.str.contains('Data Scientist', regex=False)
mentions_analyst = titles.str.contains('Data Analyst', regex=False)
keep_rows = ~(mentions_scientist & mentions_analyst)
job_desc = job_desc.loc[keep_rows].copy()


#Defining the target label -
#            0 : 'Data Analyst' (more precisely: any title that does not contain 'Data Scientist')
#            1 : 'Data Scientist'
job_desc['title_clean'] = mentions_scientist[keep_rows].astype('int64').to_numpy()

# print random sample to check the format
job_desc.sample(10)

### Splitting and Vectorizing the given data

In [None]:
#Loading all the required preprocessing and the fitting modules from the sklearn package
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer  #used for feature_extraction
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB       #used for Naive_Bayes
from sklearn.linear_model import SGDClassifier                               #used for Stochastic Gradient Descent
from sklearn.linear_model import LogisticRegression                          #used for Logistic Regression (lives in linear_model, not model_selection)
from sklearn.svm import SVC, LinearSVC                                       #used for Support Vector Machine
from sklearn.pipeline import Pipeline                                        #used to create a pipeline of methodologies


In [None]:
#Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    job_desc['clean'],
    job_desc.title_clean,
    test_size = 0.2,
    random_state = 1,
)

#Pipeline: TF-IDF over word 1- to 3-grams, followed by a linear support vector classifier
text_clf = Pipeline(steps = [
    ('vect', TfidfVectorizer(stop_words = 'english', ngram_range = (1, 3))),
    ('clf', LinearSVC()),
])

#Fitting the whole pipeline on the training split
text_clf.fit(x_train, y_train)

### Evaluating the LinearSVC model on the held-out test split of job_desc

In [None]:
#Predicting labels for the held-out test split (x_test), not the training data
y_pred_classify = text_clf.predict(x_test)

#Building and printing the per-class classification report
#importing metrics from scikit learn
from sklearn import metrics
classification_summary = metrics.classification_report(y_test, y_pred_classify)
print(classification_summary)

In [None]:
#Confusion matrix of the true test labels (y_test) against the predictions (y_pred_classify).
#As the last expression of the cell its value is shown as the cell output.
metrics.confusion_matrix(y_test,y_pred_classify)

In [None]:
#Getting the feature names (the n-gram vocabulary) learned by the fitted TF-IDF step.
#Newer scikit-learn releases expose get_feature_names_out(); get_feature_names() is
#only used as a fallback for older installations that lack the new accessor.
tfidf_step = text_clf.named_steps['vect']
if hasattr(tfidf_step, 'get_feature_names_out'):
    x_train_tokens = tfidf_step.get_feature_names_out()
else:
    x_train_tokens = tfidf_step.get_feature_names()

#Finding the length of the train_tokens
len(x_train_tokens)

In [None]:
#Defining a dictionary to store feature and related importance of SVM
#(plot_coefficients_svm fills it in as a side effect)
feature = {}                               #Empty Dictionary

#Defining a function to plot important features
def plot_coefficients_svm(classifier, feature_names, top_features = 30):
    """Plot the most negative and most positive coefficients of a linear model.

    Parameters
    ----------
    classifier : object exposing ``coef_``
        Fitted linear model. ``coef_`` is flattened, so this assumes a binary
        model with one weight per feature -- TODO confirm for other models.
    feature_names : sequence of str
        Name of each feature, in the same order as the coefficients.
    top_features : int, default 30
        How many coefficients to show from each end of the sorted weights.

    Side effects
    ------------
    Draws a bar chart (red = negative weight, blue = non-negative weight) and
    stores the ``top_features`` most negative coefficients in the module-level
    ``feature`` dictionary as ``name -> -coefficient``.
    """
    coef = np.asarray(classifier.coef_).ravel()
    feature_names = np.asarray(feature_names)

    #Sort once; clamp the request so 0 or an oversized value cannot mis-slice
    order = np.argsort(coef)
    k = max(0, min(int(top_features), coef.size))
    top_negative_coefficients = order[:k]
    top_positive_coefficients = order[coef.size - k:]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

    #Bars and tick labels share the same x positions so every label sits under its own bar
    positions = np.arange(len(top_coefficients))

    # plot the graph
    plt.figure(figsize=(20, 10))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(positions, coef[top_coefficients], color=colors)
    plt.xticks(positions, feature_names[top_coefficients], rotation=60, ha='right')
    plt.show()                           #Displaying the plot

    #Store the most negative coefficients with the sign flipped.
    #Iterating over the negative slice itself (instead of a fixed [:30]) keeps
    #positive coefficients out of the dictionary when top_features is below 30.
    for i in top_negative_coefficients:
        feature[str(feature_names[i])] = float(-coef[i])
        


cv = text_clf.named_steps['vect']
svm = text_clf.named_steps['clf']

#get_feature_names() is not available on newer scikit-learn releases, so prefer
#get_feature_names_out() and only fall back to the old accessor when it is missing
if hasattr(cv, 'get_feature_names_out'):
    vocab_terms = cv.get_feature_names_out()
else:
    vocab_terms = cv.get_feature_names()

#Calling the function with the values
plot_coefficients_svm(svm, vocab_terms)
 

# Making a word cloud

In [None]:
from wordcloud import WordCloud


#A WordCloud must be generated from text or frequencies before it can be drawn.
#It is built here from the `feature` dictionary (term -> weight) that
#plot_coefficients_svm populated, so larger weights give larger words.
wordcloud = WordCloud(width = 1000, height = 600).generate_from_frequencies(feature)
plt.figure(figsize = (15,5))
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.show()
