# Multi-Class Text Classififcation Task

In [16]:
#imports cell 

#basic imports
import pandas as pd

#NLP imports
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS


#Classification imports
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

## Step 1: Data 
<ol>
    
    1.Loading Data into a dataframe.
    
    2.Understanding and Visualizing data.
    
    3.Cleaning and preprocessing data.
    
    4.Splitting data into test and train datasets.

In [17]:
#loading data into a dataframe.
dataframe = pd.read_csv("./Job titles and industries.csv")
dataframe.head()

Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor - co...,IT
1,senior technical support engineer,IT
2,head of it services,IT
3,js front end engineer,IT
4,network and telephony controller,IT


### Cleaning the data: Removing duplicates

In [18]:
cleanframe = dataframe.drop_duplicates(subset="job title")


### Cleaning the data: Text Pre-processing

In [19]:
def clean_text(text):
    """
        text: a string 
        return: modified clean string
    """
    result = ""
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) >= 2:
            token = token.lower() # lowercase text
            result+=token+" "     # append to result
    return result

cleanframe['job title'] = cleanframe['job title'].map(clean_text)
cleanframe.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleanframe['job title'] = cleanframe['job title'].map(clean_text)


Unnamed: 0,job title,industry
0,technical support helpdesk supervisor county b...,IT
1,senior technical support engineer,IT
2,head services,IT
3,js end engineer,IT
4,network telephony controller,IT


### Splitting data into test and train datasets + Dealing with class imbalance

In [20]:
X = cleanframe['job title']
y = cleanframe['industry']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)
weights = compute_sample_weight("balanced",y_train)
# X_test.to_excel("./Xtest.xlsx")
# y_test.to_excel("./ytest.xlsx")

### Approach 2: Linear SVM

In [14]:
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),])

sgd.fit(X_train, y_train, **{'clf__sample_weight': weights})

y_pred = sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.8931140801644398


### Save the final selected Model to a file to use directly in the RESTfull API script.

In [15]:
from joblib import dump, load
dump(sgd, 'model.joblib')

['model.joblib']