In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [2]:
training_data = pd.read_csv('TrainingDataset.csv')
testing_data = pd.read_csv('TestingDataset.csv')

In [3]:
training_data.head(5)

Unnamed: 0,tweet,subtask_a,subtask_b,subtask_c
0,१९६६ साली छत्रपती शिवाजी महाराज या जागतिक दर्ज...,Offensive,TIN,GRP
1,भक्तांची आधुनिक झासी कंगना रानावत हिचे झाशीच्य...,Offensive,TIN,GRP
2,शांत राहिले की ही बाळासाहेबांची सेना नाही अन् ...,Offensive,TIN,GRP
3,फडणवीस सरकार असताना आरक्षनाचे राजकारण करुण तों...,Offensive,TIN,GRP
4,मादरचोद रांडाची औलाद आहात तुम्हीं साले गुंडे म...,Offensive,TIN,GRP


In [4]:

import re
import pandas as pd


# pre-processing the data
def clean_text(row, options):

    if options['lowercase']:
        row = str(row).lower()

    if options['strip_spaces']:
        row = str(row).strip()

    if options['remove_url']:
        row = str(row).replace('http\S+|www.\S+', '')

    if options['remove_mentions']:
        row = re.sub("@[A-Za-z0-9]+","@USER",row)

    if options['remove_newline']:
        row = re.sub(r'\n',' ',row)

    if options['remove_tab']:
        row = re.sub(r'\t',' ',row)

    if options['remove_english']:
        row = re.sub("[A-Za-z0-9]+","",row)

    if options['add_USER_tag']:
        row = re.sub("@","@USER",row)

    if options['remove_specials']:
        row = re.sub('[+,-,_,=,/,<,>,!,#,$,%,^,&,*,\",:,;,.,' ',\t,\r,\n,\',|]','',row)

    if options['remove_Quotes']:
        row = re.sub("'","",row)

    return row

clean_config = {
    'remove_url': True,
    'remove_mentions': True,
    'decode_utf8': True,
    'lowercase': True,
    'remove_english': True,
    'remove_specials': True,
    'add_USER_tag': True,
    'remove_newline':True,
    'remove_tab':True,
    'strip_spaces':True,
    'remove_Quotes':True
    }


def demoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U00010000-\U0010ffff"
                               "]+", flags=re.UNICODE)
    return(emoji_pattern.sub(r'', text))


def main():
    # csv file
    #input_file = 'Data/Fully_Annotated.csv'

    dataset_df = testing_data

    #dataset_df = pd.DataFrame(dataset)

    dataset_df = dataset_df[["tweet",'subtask_a','subtask_b','subtask_c']]
    #, "subtask_a", "subtask_b", "subtask_c"

    #lowe case conversion
    dataset_df['tweet'] = dataset_df['tweet'].str.lower()

    # calling pre-processing function
    dataset_df['tweet'] = dataset_df['tweet'].apply(clean_text, args=(clean_config,))

    #stripping leading and trailing whitespaces
    dataset_df['tweet'] = dataset_df['tweet'].str.strip()

    #remove emojis - not working
    dataset_df.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))

    #remove emojis - working
    dataset_df['tweet'] = dataset_df['tweet'].apply( lambda x : demoji(x))

    # convert df to csv
    dataset_df.to_csv('./testing_cleaned.csv',index = False)

if __name__ == "__main__":
    main()

In [5]:
training_data_cleaned = pd.read_csv("training_cleaned.csv")

In [6]:
training_data_cleaned.head(5)
tf = TfidfVectorizer(ngram_range=(1,2))
X = tf.fit_transform(training_data_cleaned['tweet'].values.astype('U'))
print(X)

  (0, 10988)	0.3028838009470212
  (0, 17722)	0.29449079111026455
  (0, 9467)	0.3028838009470212
  (0, 6138)	0.28763320275240184
  (0, 13195)	0.3028838009470212
  (0, 14795)	0.2442779258081979
  (0, 6953)	0.2270980532885134
  (0, 18890)	0.3028838009470212
  (0, 10857)	0.19263397363819082
  (0, 10934)	0.15426723908734097
  (0, 17666)	0.15908360539244384
  (0, 9415)	0.17857213453181142
  (0, 6113)	0.17108717964377632
  (0, 13153)	0.12165637184914382
  (0, 14790)	0.2213000449458429
  (0, 6950)	0.2223903416659931
  (0, 18889)	0.3028838009470212
  (1, 13874)	0.2656398347544727
  (1, 11536)	0.22836640756141807
  (1, 9158)	0.2656398347544727
  (1, 15842)	0.2656398347544727
  (1, 9565)	0.2656398347544727
  (1, 11670)	0.259454077191973
  (1, 16093)	0.259454077191973
  (1, 6247)	0.2421229205207951
  :	:
  (4621, 18391)	0.10878160445193925
  (4621, 14672)	0.11892852827277796
  (4621, 11784)	0.09998713841673679
  (4621, 5471)	0.10116240328691807
  (4621, 4048)	0.08620335178716713
  (4621, 8654)	0.1

In [7]:
#X = training_data_cleaned.drop(['subtask_a','subtask_b','subtask_c'],axis=1).values.astype('U')
y = training_data_cleaned['subtask_c'].values.astype('U')


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42,shuffle=True)

In [9]:
print(X_train.shape)
print(y_train.shape)

(3467, 19089)
(3467,)


In [10]:
print(X_test.shape)
print(y_test.shape)

(1156, 19089)
(1156,)


In [11]:
X_train

<3467x19089 sparse matrix of type '<class 'numpy.float64'>'
	with 54443 stored elements in Compressed Sparse Row format>

In [12]:

RF = RandomForestClassifier(random_state=42)
    # Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    # Number of features to consider at every split
max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 220, num=11)]
    #max_depth.append(None)
    # Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
bootstrap = [True, False]
    # Create the random grid
random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=RF, param_distributions=random_grid, n_iter=100, cv=3, verbose=2,
                                   random_state=42, n_jobs=-1)




In [13]:
testing_data_cleaned = pd.read_csv('testing_cleaned.csv')
testing_data_cleaned.head(5)

Unnamed: 0,tweet,subtask_a,subtask_b,subtask_c
0,राम कदम वागण्यात नाही तर बोलण्यात चुकला बीजेपी...,Offensive,TIN,GRP
1,हीच का तुमची शिवसेने चि शिकवण आपली आई म्हणजे द...,Offensive,TIN,GRP
2,हे वाचा गाढवांनो आणि हे ही सांगा की तुमच्या मॅ...,Offensive,TIN,GRP
3,भक्त आंधळे असतात मूर्खा ना काही कळत नाही,Offensive,TIN,GRP
4,वर्षे गोट्या खेडत बसले होते काय साहेब तुम्ही क...,Offensive,TIN,IND


In [14]:
testing_data_cleaned.head(5)
unseen_y = testing_data_cleaned['subtask_c']
unseen_X = tf.transform(testing_data_cleaned['tweet'].values.astype('U'))
#print(X)

In [15]:
def train_test_TFIDF(X_train,y_train,X_test,y_test,unseen_X,unseen_y,model):
    print("TFIDF + ", model)
    #model = make_pipeline(TfidfVectorizer(ngram_range=(1, 5)), model)

#     X_train = train_data['tweet'].values.astype('U')
#     y_train = train_data['subtask_a'].values.astype('U')

#     X_test = test_data['tweet'].values.astype('U')
#     y_test = test_data['subtask_a'].values.astype('U')

    model.fit(X_train, y_train)

    labels = model.predict(X_test)

    print("training Accuracy:", metrics.accuracy_score(y_test, labels) * 100)

    cm = confusion_matrix(y_test, labels)
    print("Confusion matrix\n", cm)
    print(classification_report(y_test, labels, digits=4))
    print("\n\n")
    
    test_labels = model.predict(unseen_X)
    
    print("testing accuracy:",metrics.accuracy_score(unseen_y,test_labels) * 100)
    cm = confusion_matrix(unseen_y, test_labels)
    print("Confusion matrix\n", cm)
    print(classification_report(unseen_y, test_labels, digits=4))
    print("\n\n")

In [None]:
train_test_TFIDF(X_train,y_train,X_test,y_test,unseen_X,unseen_y,rf_random)
print(rf_random.best_params_)
print("End of RFC")

TFIDF +  RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 31, 52, 73, 94, 115,
                                                      136, 157, 178, 199, 220],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)
Fitting 3 folds for each of 100 candidates, totalling 300 fits




training Accuracy: 75.69204152249135
Confusion matrix
 [[ 20  15   1  15]
 [ 11 230  10 158]
 [  2   7   2   4]
 [  1  57   0 623]]
              precision    recall  f1-score   support

         GRP     0.5882    0.3922    0.4706        51
         IND     0.7443    0.5623    0.6407       409
         OTH     0.1538    0.1333    0.1429        15
         nan     0.7788    0.9148    0.8413       681

    accuracy                         0.7569      1156
   macro avg     0.5663    0.5007    0.5239      1156
weighted avg     0.7501    0.7569    0.7449      1156




