In [1]:
##imports
import pandas as pd
import numpy as np
import re, string
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english')) 

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from textblob import TextBlob
from textblob import Word

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier



In [4]:
## Reading Input Files - Input
def read_files(filename):
    """Load a header-less, two-column CSV of (text, label) rows.

    Rows with a missing value in either column are discarded and the number
    of remaining rows is printed.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``text`` and ``label``. The index is not reset
        after rows are dropped, so it may contain gaps.
    """
    raw = pd.read_csv(filename, header=None)
    raw.columns = ['text', 'label']
    records = raw.dropna()
    print('Total num of records ', len(records))
    return records

In [5]:
def rebalance_data(df, random_state=42):
    """Down-sample so that labels 0 and 1 occur equally often.

    Each class is randomly sampled (without replacement) down to the size of
    the smaller class. The result holds all label-0 rows first, then all
    label-1 rows, with a fresh 0..n-1 index. Rows whose label is neither 0
    nor 1 are not carried over.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample`` so repeated runs pick the same
        rows (matching the fixed seed used for the train/test split elsewhere
        in this notebook). Pass ``None`` for a different draw on every call.

    Returns
    -------
    pandas.DataFrame
        The balanced frame; the input frame is left untouched.
    """
    negatives = df[df['label'] == 0]
    positives = df[df['label'] == 1]
    n_per_class = min(len(negatives), len(positives))

    balanced = pd.concat([
        negatives.sample(n_per_class, random_state=random_state),
        positives.sample(n_per_class, random_state=random_state),
    ])
    balanced = balanced.reset_index(drop=True)
    print('Total num of records after balancing ', len(balanced))
    return balanced

In [6]:
# Matches any single ASCII punctuation character; compiled once, not per call.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def clean_lemma_text_blob(x):
    """Normalise one piece of text for TF-IDF vectorisation.

    Steps, in order:
      1. strip surrounding whitespace and delete ASCII punctuation;
      2. split on whitespace and lowercase every token;
      3. drop tokens found in the module-level ``stop_words`` set;
      4. lemmatise each remaining token as a verb with TextBlob's ``Word``;
      5. POS-tag the result with TextBlob and drop tokens tagged ``'NN'``.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        Space-joined, lowercase tokens; ``''`` when nothing survives.
    """
    stripped = _PUNCTUATION_RE.sub('', x.strip())

    # Lowercase BEFORE the stop-word test: NLTK's English list is lowercase,
    # so testing the raw token let capitalised stop words ("This", "It")
    # slip through.
    lowered_tokens = (token.lower() for token in stripped.split())
    lemmas = [Word(token).lemmatize("v")
              for token in lowered_tokens
              if token not in stop_words]

    sentence = ' '.join(lemmas)
    if not sentence.split():
        return ''

    # Filter on the (word, tag) pairs directly. This also copes with an empty
    # tag list, where the previous 2-D numpy indexing raised IndexError.
    kept = [word for word, tag in TextBlob(sentence).tags if tag != 'NN']
    return ' '.join(kept).lower()
    

In [7]:
def clean_text_df(df):
    """Add a ``text_noun_cleaned`` column derived from ``text``.

    Every value of ``df['text']`` is passed through
    ``clean_lemma_text_blob``. The column is written onto ``df`` itself and
    that same frame is returned.
    """
    cleaned = df['text'].apply(clean_lemma_text_blob)
    df['text_noun_cleaned'] = cleaned
    return df

In [8]:
def get_sentiment_features(df):
    """Add TextBlob ``polarity`` and ``subjectivity`` columns for ``text``.

    The sentiment is computed on the raw ``text`` column (not the cleaned
    one). Each text is analysed once and both scores are read from that
    single result, rather than building a second TextBlob per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the two columns written onto it.
    """
    sentiments = [TextBlob(text).sentiment for text in df['text']]
    # Plain lists are assigned by position, so df's index does not matter.
    df['polarity'] = [s.polarity for s in sentiments]
    df['subjectivity'] = [s.subjectivity for s in sentiments]
    return df

In [9]:
def data_transformation(df,tfidf_params = 50):
    """Build the model input matrix: TF-IDF features plus sentiment scores.

    Fits a uni+bigram ``TfidfVectorizer`` on ``df['text_noun_cleaned']`` and
    returns one dense column per vocabulary term, followed by ``label`` (only
    if present in ``df``), ``polarity`` and ``subjectivity``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``text_noun_cleaned``, ``polarity`` and ``subjectivity``
        columns; ``label`` is optional.
    tfidf_params : int or float, default 50
        Forwarded as the vectorizer's ``min_df``.

    Returns
    -------
    (pandas.DataFrame, TfidfVectorizer)
        The feature frame (fresh 0..n-1 index, rows in the same order as
        ``df``) and the fitted vectorizer.

    Notes
    -----
    NOTE(review): a vocabulary term literally named ``label``, ``polarity``
    or ``subjectivity`` would be overwritten by the column of the same name.
    """
    ## Tweaking parameters
    vectorizer = TfidfVectorizer(min_df=tfidf_params,ngram_range=(1,2))
    tfidf = vectorizer.fit_transform(df['text_noun_cleaned'])

    # Newer scikit-learn replaced get_feature_names() with
    # get_feature_names_out(); prefer the new accessor and fall back so older
    # installs keep working.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    df_input = pd.DataFrame(tfidf.toarray(), columns=list(feature_names))

    # Copy by position (.values), not by index label: df_input has a brand-new
    # RangeIndex, so label-aligned assignment would misplace rows or insert
    # NaN whenever df's index is not exactly 0..n-1.
    if 'label' in df.columns:
        df_input['label'] = df['label'].values
    df_input['polarity'] = df['polarity'].values
    df_input['subjectivity'] = df['subjectivity'].values
    return df_input, vectorizer

In [10]:
def model_selection(df_input):
    """Train several classifiers and return the one with the best accuracy.

    The frame is split 67/33 (seed 42). Four candidates are fitted on the
    training part and scored on the held-out part:

      * Bernoulli naive Bayes with default settings,
      * random forest tuned by 5-fold grid search,
      * k-nearest-neighbours tuned by 5-fold grid search,
      * a hard-voting ensemble of the three.

    Each accuracy is printed. On a tie the candidate trained earliest wins.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Feature columns plus a ``label`` column.

    Returns
    -------
    fitted estimator
        The ``BernoulliNB``, one of the two fitted ``GridSearchCV`` objects,
        or the ``VotingClassifier``; all of them expose ``predict``.

    Notes
    -----
    The winner is picked on the same hold-out set that the accuracies are
    reported on, so the winning score is an optimistic estimate.
    """
    feature_cols = list(df_input.columns)
    feature_cols.remove('label')
    X_train, X_test, y_train, y_test = train_test_split(
        df_input[feature_cols], df_input['label'],
        test_size=0.33, random_state=42)

    def holdout_accuracy(fitted, message):
        # Score an already-fitted estimator on the hold-out set and report it.
        acc = accuracy_score(y_test, fitted.predict(X_test))
        print(message, acc)
        return acc

    results = {}

    ## BNB
    clf_bnb = BernoulliNB().fit(X_train, y_train)
    results[clf_bnb] = holdout_accuracy(
        clf_bnb, "Accuracy for BNB model with/without tuning: ")

    ## Random Forest
    rf_grid = [{
        'n_estimators': np.arange(10,110,10),
        'max_depth' : [4,5,6,7,8]
    }]
    CV_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                         param_grid=rf_grid, cv=5)
    CV_rf.fit(X_train, y_train)
    results[CV_rf] = holdout_accuracy(
        CV_rf, "Accuracy for Random Forest after CV: ")

    ## KNN
    knn_grid = [{
        'n_neighbors': np.arange(8,20)
    }]
    CV_knn = GridSearchCV(estimator=KNeighborsClassifier(),
                          param_grid=knn_grid, cv=5)
    CV_knn.fit(X_train, y_train)
    results[CV_knn] = holdout_accuracy(
        CV_knn, "Accuracy for KNN after CV data: ")

    ## Voting Classifier
    # VotingClassifier clones and refits its members. Handing it the
    # GridSearchCV wrappers would rerun both full grid searches, so pass the
    # already-selected best estimators: same tuned models, one fit each.
    eclf1 = VotingClassifier(
        estimators=[('nb', clf_bnb),
                    ('rf', CV_rf.best_estimator_),
                    ('knn', CV_knn.best_estimator_)],
        voting='hard')
    eclf1 = eclf1.fit(X_train, y_train)
    results[eclf1] = holdout_accuracy(
        eclf1, "Accuracy for voting classifier: ")

    best_estimator = max(results, key=lambda k: results[k])
    return best_estimator
    

In [11]:
def train_pipeline(filename):
    """Run the whole training flow for one labelled CSV file.

    Stages: read the file, balance the two classes, clean the text, add
    sentiment scores, vectorise, then pick the best classifier.

    Returns
    -------
    (estimator, TfidfVectorizer)
        The classifier chosen by ``model_selection`` and the vectorizer
        fitted by ``data_transformation``; ``predict_pipeline`` needs both.
    """
    records = read_files(filename)                       # 1. load
    balanced = rebalance_data(records)                   # 2. equalise classes
    cleaned = clean_text_df(balanced)                    # 3. text cleaning
    enriched = get_sentiment_features(cleaned)           # 4. sentiment features
    model_frame, vectorizer = data_transformation(enriched)  # 5. TF-IDF
    best_model = model_selection(model_frame)            # 6. model selection
    return best_model, vectorizer

In [12]:
def predict_pipeline(x,vect,clf):
    """Classify a single raw text with a trained vectorizer and classifier.

    The text is cleaned with ``clean_lemma_text_blob`` (the cleaned form is
    printed), vectorised with the already-fitted ``vect``, and extended with
    the TextBlob polarity and subjectivity of the raw text. Columns are laid
    out as: vocabulary terms, ``polarity``, ``subjectivity``.

    Parameters
    ----------
    x : str
        Raw input text.
    vect : fitted TfidfVectorizer
        As returned by ``train_pipeline``.
    clf : fitted estimator
        Any object with a ``predict`` method.

    Returns
    -------
    The result of ``clf.predict`` on the single-row feature frame.
    """
    # Data Transformation - Preprocessing and the trained TFIDF vectoriser
    clean_text = clean_lemma_text_blob(x)
    print("Text After Preprocessing",clean_text)
    tfidf = vect.transform([clean_text])

    # Newer scikit-learn replaced get_feature_names() with
    # get_feature_names_out(); prefer the new accessor and fall back so older
    # installs keep working.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    df_input = pd.DataFrame(tfidf.toarray(), columns=list(feature_names))

    # Analyse the raw text once and read both scores from that result.
    sentiment = TextBlob(x).sentiment
    df_input['polarity'] = [sentiment.polarity]
    df_input['subjectivity'] = [sentiment.subjectivity]

    # Prediction with the trained classifier
    return clf.predict(df_input)
    

In [13]:
%%time
# Complete Training Pipeline
# train_pipeline returns the best-scoring classifier and the fitted TF-IDF
# vectorizer; both are kept as notebook globals for the prediction cell.
# The CSV path is relative to the kernel's working directory.
trained_classifier, vect= train_pipeline('data/Q1_Open_scored.csv')

FileNotFoundError: File b'data/Q1_Open_scored.csv' does not exist

In [12]:
%%time
# Smoke test: classify one hand-written sentence end to end.
# Needs `vect` and `trained_classifier` from the training cell.
x = 'This is impacting our business and it is essential that we work towards rectifying it!'
print(predict_pipeline(x,vect,trained_classifier))

Text After Preprocessing impact business essential work rectify


AttributeError: 'BernoulliNB' object has no attribute 'pre'

In [13]:
# Show which estimator class model selection returned.
type(trained_classifier)

sklearn.naive_bayes.BernoulliNB

In [14]:
# List the vocabulary terms kept by the fitted vectorizer (after its min_df cut-off).
# NOTE(review): newer scikit-learn exposes this as get_feature_names_out();
# confirm the installed version still provides get_feature_names().
vect.get_feature_names()

['affect',
 'critical',
 'get',
 'impact',
 'important',
 'information',
 'late',
 'make',
 'meet',
 'miss',
 'need',
 'team',
 'time',
 'update',
 'work']