# People Dimension Model Classification

In [None]:
# Import starting libraries
import os
import pandas as pd
import numpy as np
import pickle
import nltk 

# Set directory
# Files are opened by bare name further down (the pickle that is read and the
# CSV that is written), so make the data folder the working directory.
data_dir = "/Users/Sam Edds/datascience-challenge/data"
os.chdir(data_dir)

In [None]:
# Unpickle file
# NOTE(review): unpickling executes whatever the file contains; only load a
# pickle that comes from a trusted source.
labeled = pd.read_pickle("labelled_dataset.pickle")

# Check out data: column summary, then the first five rows
labeled.info()
labeled.head(5)

# Initial Cleaning

In [None]:
# Check for and remove duplicates
# Two rows count as duplicates when both the review text and its label match.
dedupe_keys = ['text', 'labelmax']
labeled = labeled.drop_duplicates(subset=dedupe_keys)
labeled.shape[0]

In [None]:
import re

### Clean text ###

# Removing stop words
# Held as a frozenset: preprocess() tests every word of every review against
# this collection, and a hash lookup avoids scanning a list each time.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))

# Get rid of extras, split, and join for text
def preprocess(x):
    """Normalise one review for the text features.

    The text is lower-cased, every character other than a-z and whitespace
    is removed, and words found in the module-level ``stopwords`` collection
    are dropped.

    Parameters
    ----------
    x : str
        Raw review text.

    Returns
    -------
    str
        The surviving words joined by single spaces; '' if none survive.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence, which
    # newer Python versions warn about at compile time.
    letters_only = re.sub(r'[^a-z\s]', '', x.lower())
    kept_words = [w for w in letters_only.split() if w not in stopwords]
    return ' '.join(kept_words)

# Call
# Run the cleaner over every review and keep the result next to the raw text
cleaned_text = labeled['text'].map(preprocess)
labeled['text_clean'] = cleaned_text


## Label checks

In [None]:
# Keep only obs with text
# Rows whose cleaned text came out empty are dropped; then show the row count
has_text = labeled['text_clean'].ne('')
labeled = labeled.loc[has_text]
labeled.shape[0]

In [None]:
# Check the labels: number of reviews per label value
labeled.groupby('labelmax')['text'].agg(['count'])

# Remove null: drop rows whose label is the literal string 'null'
is_null_label = labeled['labelmax'].eq('null')
labeled = labeled.loc[~is_null_label]

## Split pros and cons

In [None]:
# Split into pros and cons
# Cut each cleaned review once, at the first stand-alone word "cons":
#   * \b...\b keeps words that merely contain "cons" ("icons") from splitting;
#   * maxsplit=1 keeps everything after the marker together, so a second
#     "cons" later in the review no longer truncates the cons text.
cons_marker = re.compile(r"\bcons\b\s*")
pros_cons = [cons_marker.split(w, maxsplit=1) for w in labeled['text_clean']]

# Check if there are just pros or just cons
for review in pros_cons:
    if len(review) == 1:
        # No marker found: show the review for inspection
        print(review)
        # Since just pros add '' for cons
        review.append('')

# Create new variables (text before the marker / text after the marker)
labeled['pros'] = [w[0] for w in pros_cons]
labeled['cons'] = [w[1] for w in pros_cons]

# Support Vector Classification

In [None]:
# Make labels numeric
# Turn the label column into a categorical and keep its integer codes in
# 'label' for use as the model target.
labeled['labelmax'] = pd.Categorical(labeled['labelmax'])
label_codes = labeled['labelmax'].cat.codes
labeled['label'] = label_codes

## N Grams with TF-IDF

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

# Compute n grams from a dataframe for a given variable
class Ngrams(BaseEstimator, TransformerMixin):
    """TF-IDF features over the word 1- and 2-grams of one text column.

    The vocabulary and IDF weights are learned in ``fit`` and re-used in
    ``transform``. Previously the vectorizer was re-fitted inside
    ``transform``, so a frame transformed at prediction time got a
    vocabulary of its own rather than the one learned during training.

    Parameters
    ----------
    df : pandas.DataFrame or str, optional
        Selects the column to vectorise: either a column name, or a
        DataFrame whose first column name is used. When it is None, or the
        selected column is absent from the frame handed to ``fit`` /
        ``transform``, the first column of that frame is used instead.
    """

    def __init__(self, df=None):
        # Stored unmodified so get_params() / clone() can read it back
        self.df = df

    def _column(self, df):
        """Return the name of the column of `df` to vectorise."""
        spec = self.df
        if isinstance(spec, str):
            wanted = spec
        else:
            spec_columns = getattr(spec, 'columns', None)
            has_columns = spec_columns is not None and len(spec_columns) > 0
            wanted = spec_columns[0] if has_columns else None
        if wanted is not None and wanted in df.columns:
            return wanted
        if wanted is not None:
            import warnings
            warnings.warn("Ngrams: column %r not found; using first column %r"
                          % (wanted, df.columns[0]))
        return df.columns[0]

    def fit(self, df, y=None):
        """Learn the n-gram vocabulary and IDF weights from `df`; return self."""
        # Initiate TfidfVectorizer (at most 50 features, English stop words removed)
        self.vectorizer_ = TfidfVectorizer(strip_accents='unicode', use_idf=True,
                                           stop_words='english', analyzer='word',
                                           ngram_range=(1, 2), max_features=50)
        # Fit to data
        self.vectorizer_.fit(df[self._column(df)].values)
        return self

    def transform(self, df):
        """Return the sparse TF-IDF matrix of `df` using the fitted vocabulary.

        ``fit`` must have been called first.
        """
        return self.vectorizer_.transform(df[self._column(df)].values)

## Punctuation

In [None]:
# Check for punctuation that may be an indicator of strong feelings about time at company
class Punctuation(BaseEstimator, TransformerMixin):
    """0/1 indicator features computed from the raw 'text' column.

    Produced columns:
      * exclaim_many - the text contains two or more consecutive '!'
      * exclaim_q    - the text contains at least one '!' or '?'
      * caps         - the text contains ten consecutive capital letters

    The earlier patterns were character classes that included '(' and ')',
    so parentheses were counted as exclamation/question marks, and
    'exclaim_many' was already true for a single character.

    Parameters
    ----------
    df : object, optional
        Accepted for compatibility with existing calls; not used by the
        computation, which always reads the 'text' column of the frame
        passed to ``transform``.
    """

    def __init__(self, df=None):
        # Stored unmodified so get_params() / clone() can read it back
        self.df = df

    def fit(self, df, y=None):
        """Nothing to learn; return self."""
        return self

    def transform(self, df):
        """Return a DataFrame (same index as `df`) holding the three flags."""
        df_new = df[['text']].copy()
        text = df_new['text']

        # na=False: a missing review yields 0 instead of propagating NaN
        # Lots of exclamations
        df_new['exclaim_many'] = text.str.contains(r'!{2,}', regex=True, na=False) * 1

        # Exclamation or question marks
        df_new['exclaim_q'] = text.str.contains(r'[!?]', regex=True, na=False) * 1

        # Caps: a run of 10 capital letters. No capture group, because
        # str.contains warns when the pattern contains match groups.
        df_new['caps'] = text.str.contains(r'[A-Z]{10}', regex=True, na=False) * 1

        # Drop text so only the indicator columns are returned
        return df_new.drop(columns=['text'])

## Sentiment Analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

### Positive, negative, neutral sentiment analysis for a given variable ###

class SentimentAnalysis(BaseEstimator, TransformerMixin):
    """VADER polarity scores for one text column.

    Each row of the selected column is scored with
    ``SentimentIntensityAnalyzer.polarity_scores`` and the resulting records
    become the output columns, suffixed with ``_<column name>``.

    Previously the constructor argument was discarded and the first column
    of whatever frame reached ``transform`` was scored, so two instances
    built for different columns produced identical features.

    Parameters
    ----------
    df : pandas.DataFrame or str, optional
        Selects the column to score: either a column name, or a DataFrame
        whose first column name is used. When it is None, or the selected
        column is absent from the frame handed to ``transform``, the first
        column of that frame is used (with a warning in the latter case).
    """

    def __init__(self, df=None):
        # Stored unmodified so get_params() / clone() can read it back
        self.df = df

    def _column(self, df):
        """Return the name of the column of `df` to score."""
        spec = self.df
        if isinstance(spec, str):
            wanted = spec
        else:
            spec_columns = getattr(spec, 'columns', None)
            has_columns = spec_columns is not None and len(spec_columns) > 0
            wanted = spec_columns[0] if has_columns else None
        if wanted is not None and wanted in df.columns:
            return wanted
        if wanted is not None:
            import warnings
            warnings.warn("SentimentAnalysis: column %r not found; using first column %r"
                          % (wanted, df.columns[0]))
        return df.columns[0]

    def fit(self, df, y=None):
        """Nothing to learn; return self."""
        return self

    def transform(self, df):
        """Return a DataFrame with one row of polarity scores per row of `df`."""
        column = self._column(df)
        # Initialize
        sent = SIA()
        # Compute polarity score for each review
        results = [sent.polarity_scores(review) for review in df[column].tolist()]
        # Make into a pandas df
        df_new = pd.DataFrame.from_records(results)
        # Suffix with the column name as a plain string (the code used to pass
        # the whole columns Index as the suffix)
        return df_new.add_suffix('_' + str(column))


## Combine and run model

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline, FeatureUnion

### Pipe different features in with a name so the step can be later called for details ###

# Every transformer in the FeatureUnion receives the frame passed to
# fit/predict; their outputs are concatenated and fed to the linear SVM
# (SGDClassifier with hinge loss).
pipeline = Pipeline([
    ('feats', FeatureUnion([
        # Ngrams
        ('ngram_all', Ngrams(labeled[['text_clean']])),
        # Sentiment
        ('sent_pros', SentimentAnalysis(labeled[['pros']])),
        ('sent_cons', SentimentAnalysis(labeled[['cons']])),
        # Punctuation
        ('punc', Punctuation(labeled)),
    ])),
    # Classifier
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=52,
                          max_iter=10, tol=10)),
])


# Cross validation and tuning
from sklearn.model_selection import GridSearchCV
param_grid = {
    'clf__alpha': (1e1, 1e3, 1e-5),
    'clf__max_iter': (20, 30),
}


# Find best model
# `iid` is no longer passed: the argument was deprecated in scikit-learn 0.22
# and removed in 0.24 (where passing it raises TypeError). From 0.22 on the
# default scoring is what iid=False used to select.
# NOTE(review): on 0.20/0.21 omitting it falls back to the old default and
# emits a deprecation warning.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, refit=True)
grid_search.fit(labeled, labeled['label'])

# Print it
print(grid_search.best_score_)
print(grid_search.best_params_)



In [None]:
# Run pipeline on best model
# Re-use the pipeline object defined for the grid search instead of spelling
# the same steps out a second time (two copies can drift apart). The search
# only varies clf__alpha and clf__max_iter, so applying best_params_ to that
# pipeline gives the same configuration the duplicated definition built.
pipeline.set_params(**grid_search.best_params_)

# Run: fit on the full labeled frame
pipeline.fit(labeled, labeled['label'])


## Set up Unlabeled

In [None]:
import json

### Read in unlabeled data and make into a list of dictionaries ###

# Set file location (written once instead of repeated per use)
unlabelled_dir = "/Users/Sam Edds/datascience-challenge/data/unlabelled-dataset/"

# Initialize
all_reviews = []
# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the position of each file in all_reviews reproducible between runs.
for file in sorted(os.listdir(unlabelled_dir)):
    full_filename = os.path.join(unlabelled_dir, file)
    # append each set of reviews to a list
    # (the parsed object is appended directly, so the builtin `dict` is no
    # longer rebound at module level)
    with open(full_filename, 'r', encoding='utf-8') as indv_review:
        all_reviews.append(json.load(indv_review))
    

In [None]:
### Make into a dataframe ###

# One frame per entry of all_reviews; 'n' is a count to keep track of the
# company (the entry's position in the list)...this could be used in the future
company_frames = []
for i, reviews in enumerate(all_reviews):
    init_df = pd.DataFrame(reviews)
    init_df['n'] = i
    company_frames.append(init_df)

# Concatenate once: DataFrame.append copied the accumulated frame on every
# iteration and was removed in pandas 2.0.
unlabeled = pd.concat(company_frames)

In [None]:
### Clean to match test file ###

# advice, rating and title are not used by any feature, so remove them
unlabeled = unlabeled.drop(columns=['advice', 'rating', 'title'])
unlabeled = unlabeled.reset_index(drop=True)

# Clean text using preprocesser from beginning of code
unlabeled['text_clean'] = unlabeled['text'].apply(preprocess)

# The pipeline's sentiment steps are constructed from labeled[['pros']] and
# labeled[['cons']], so give this frame the same two columns: cut the cleaned
# text once at the first stand-alone word "cons"; reviews without that marker
# get '' for cons.
# NOTE(review): this adds two columns to anything later written from
# `unlabeled`; assumes the unlabeled reviews use the same "pros ... cons ..."
# layout as the labeled ones - confirm.
halves = [re.split(r"\bcons\b\s*", text, maxsplit=1) for text in unlabeled['text_clean']]
unlabeled['pros'] = [parts[0] for parts in halves]
unlabeled['cons'] = [parts[1] if len(parts) > 1 else '' for parts in halves]


## Predict Out-Of-Sample

In [None]:
# Predict labels
# The fitted pipeline receives the whole unlabeled frame and returns one
# predicted label code per row.
pred_unlabeled = pipeline.predict(unlabeled)

# Add to df
unlabeled = unlabeled.assign(label=pred_unlabeled)

In [None]:
# Check out the labels
print(labeled['labelmax'].unique())
print(labeled['label'].unique())

# Add them
# Add corresponding category: codes 0-4 are looked up in the table below and
# every other code is named "result".
code_to_category = {
    0: "adaptability",
    1: "collaboration",
    2: "customer",
    3: "detail",
    4: "integrity",
}
unlabeled['category'] = [code_to_category.get(code, "result")
                         for code in unlabeled['label']]

# Output to csv
unlabeled.to_csv("unlabeled_review_predictions_SE.csv")

In [None]:
# Look at how we overpredict likely on 2 and 5
# Number of predicted reviews per category, as a one-column 'count' table
unlabeled.groupby('category')['label'].count().to_frame('count')