<h2>Importing Required Libraries and Language Resources</h2>

In [1]:
import nest_asyncio
# Apply nest_asyncio's patch before any later cell runs.
# NOTE(review): presumably needed so that libraries which start their own asyncio
# loop (e.g. inltk's setup) work inside the already-running Jupyter loop — confirm.
nest_asyncio.apply()

In [2]:
import sys
from indicnlp import common

In [3]:
import os

# Locations of the local git checkouts used by the Indic NLP library.
# The defaults are the original machine-specific paths; set the environment
# variables INDIC_NLP_LIB_HOME / INDIC_NLP_RESOURCES to point at your own clones
# instead of editing the notebook.

# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME = os.environ.get(
    "INDIC_NLP_LIB_HOME",
    r"/Users/sudheera/workspace/python/CS584/indic_nlp_library",
)

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES = os.environ.get(
    "INDIC_NLP_RESOURCES",
    r"/Users/sudheera/workspace/python/CS584/indic_nlp_resources",
)

In [4]:
import os

# Add library to Python path.
# The path is built with os.path.join: the original r'{}\src' hard-coded a Windows
# separator, which on a POSIX path such as INDIC_NLP_LIB_HOME yields a single
# non-existent ".../indic_nlp_library\src" entry.
# NOTE(review): `from indicnlp import common` already ran in an earlier cell, so the
# package must be importable without this entry — confirm whether it is still needed.
_indic_nlp_src = os.path.join(INDIC_NLP_LIB_HOME, "src")
if _indic_nlp_src not in sys.path:  # keep the cell idempotent when re-run
    sys.path.append(_indic_nlp_src)

# Set environment variable for resources folder
common.set_resources_path(INDIC_NLP_RESOURCES)

In [5]:
from inltk.inltk import setup
# Run iNLTK's setup for the "en" language code only.
# NOTE(review): get_tokens() further down calls tokenize() with "hi", "hi-en", "te",
# "mr", "ta", "ml", "bn", "kn", "or" and "gu". If iNLTK requires setup() once per
# language, those calls raise and get_tokens() falls back to the trivial tokenizer —
# confirm and set up the additional codes if so.
setup("en")

In [6]:
import stanza
# Download the English stanza resources and build a default English pipeline.
# NOTE(review): `stanza_nlp` is not referenced again in the visible notebook, and a
# second pipeline (`nlp_stanza`) is created in a later cell — confirm both are needed.
stanza.download('en')
stanza_nlp = stanza.Pipeline('en')

<h2>Importing Data</h2>

In [7]:
import pandas as pd
import numpy as np
import re

# Directory holding the competition CSV files (machine-specific absolute path).
DATA_DIR = "/Users/sudheera/workspace/python/CS584/Multilingual_Abusive_Comment_Identification"

# Load the labelled training set and the unlabelled test set from DATA_DIR.
train_data = pd.read_csv(f"{DATA_DIR}/train_set.csv", delimiter=",")
test_data = pd.read_csv(f"{DATA_DIR}/test_set.csv", delimiter=",")

<h2>Exploring Data</h2>

In [8]:
# Class balance: fraction of training rows per label value.
train_data["label"].value_counts(normalize=True)

0    0.52987
1    0.47013
Name: label, dtype: float64

In [9]:
# Fraction of training rows per language.
train_data["language"].value_counts(normalize=True)

Hindi         0.461896
Telugu        0.145873
Marathi       0.108330
Tamil         0.104500
Malayalam     0.061598
Bengali       0.034336
Kannada       0.020966
Odia          0.016501
Gujarati      0.013274
Haryanvi      0.013250
Bhojpuri      0.008727
Rajasthani    0.006568
Assamese      0.004180
Name: language, dtype: float64

In [10]:
# True if any cell anywhere in the training frame is missing.
train_data.isna().any().any()

False

In [11]:
# True if any cell anywhere in the test frame is missing.
test_data.isna().any().any()

False

In [12]:
# Print the (rows, columns) shape of the training and test frames.
print(train_data.shape)
print(test_data.shape)

(665042, 9)
(74253, 8)


In [13]:
# Display the training frame (the rendering below is truncated by pandas).
train_data

Unnamed: 0,language,post_index,commentText,report_count_comment,report_count_post,like_count_comment,like_count_post,label,val
0,Hindi,238566,शायद योगी जी है,0,0,1,1,0,0
1,Hindi,7009,Tingri h to putri tu.. .,0,0,0,0,0,0
2,Hindi,404648,Saale Tu kon sa pagal Nahi h . Teri comment pa...,0,0,0,0,1,0
3,Hindi,5057,girl 😘😘 aaj ke baad msg ki to maar daluggi,0,0,0,0,0,0
4,Hindi,107146,Free fire pubg ka baap ha kutta sala kamina,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...
665037,Hindi,372573,Yr bhai ye launda jahar h...,0,0,1,530,0,0
665038,Hindi,271190,⃢☠︎︎🦅🅡︎ ⃢🅴︎ ⃢🅗︎⃢ 🅰︎🅽︎⑅⃝💜✔︎ koshish kr lo dodne...,0,0,0,0,0,0
665039,Hindi,356484,बेरीनाईस पिक जी 🇮🇳 *🙏 🙏,0,0,0,0,0,0
665040,Hindi,356897,Amit mar do sale ko,0,0,0,0,0,0


In [14]:
# Display the test frame (the rendering below is truncated by pandas).
test_data

Unnamed: 0,Id,language,post_index,commentText,report_count_comment,report_count_post,like_count_comment,like_count_post
0,2,Bengali,182442,Bichna theke agun berochhe re tar modhhyeu ami...,0,0,0,0
1,3,Hindi,406921,JYOTI💏 chut ka pani,0,0,0,0
2,4,Hindi,233255,Kuth tik n h,0,0,0,0
3,5,Telugu,219308,Ekkada ap valaki kallu guddi ah?😠sarigga chuda...,0,0,0,0
4,6,Hindi,269812,pagal khi ke gadhe me dha aata h bachho se kuc...,0,0,0,0
...,...,...,...,...,...,...,...,...
74248,74250,Hindi,358105,Bhabhi ji banne se pahle sali ka kapda Palo bh...,0,0,0,0
74249,74251,Hindi,275919,Nice gannd dogi kya,0,0,0,0
74250,74252,Hindi,33762,Tumhare boobs ke Dam per pahchan Hai Tumhara,0,0,0,0
74251,74253,Marathi,381085,Tu उगाच उडता तिर घेत aahes,0,0,0,0


<h2>Data Pre-Processing</h2>

In [15]:
import re
import nltk
import unicodedata
from bs4 import BeautifulSoup
import emoji

# The cleaning patterns are compiled once here instead of on every pre_process() call.

# E-mail addresses such as "name@example.com".
_EMAIL_PATTERN = re.compile(r'([\w\.-]+@[\w\.-]+\.\w+)')

# Liberal URL matcher. It is assembled from adjacent raw strings so that no
# whitespace ends up inside the pattern: the original used backslash-newline
# continuations inside a raw string, which inserted literal spaces/newlines into
# the regex and broke several of its alternatives.
# NOTE(review): the nested quantifiers can backtrack heavily on long runs of
# punctuation; the same pattern shape was already executed by the original code.
_URL_PATTERN = re.compile(
    r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'
    r'[a-z0-9.\-]+[.][a-z]{2,4}/|[a-z0-9.\-]+[.][a-z])(?:[^\s()<>]+|\(([^\s()<>]+|'
    r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
    r'[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
)

# Emoji and symbol code points to strip. The character class is the same set as
# before (a range that was listed twice is written once).
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2-\U0001F251"  # very broad: everything from U+24C2 up to U+1F251
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"  # all supplementary-plane code points
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"  # zero-width joiner
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # variation selector-16
                            u"\u3030"
                            "]+", flags=re.UNICODE)


def pre_process(string):
    """Clean one raw comment before tokenisation.

    Steps, in order:
      1. strip HTML markup, keeping only the text content;
      2. remove e-mail addresses, then URLs;
      3. remove emoji / pictographic symbols.

    Fix over the previous version: the emoji step now runs on the URL-stripped
    text. Before, it ran on the e-mail-stripped text, so the result of the URL
    removal was computed and then thrown away.

    Note: digits are left untouched (the old step comment mentioned "numbers",
    but no code ever removed them).

    Parameters
    ----------
    string : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment text.
    """
    # 1. Remove HTML tags
    text_only = BeautifulSoup(string, features="html.parser").get_text()

    # 2. Remove Email IDs and URLs
    no_email = _EMAIL_PATTERN.sub('', text_only)
    no_url = _URL_PATTERN.sub('', no_email)

    # 3. Remove Emojis
    return _EMOJI_PATTERN.sub(r'', no_url)

In [16]:
# Clean every training comment in place with pre_process().
train_data["commentText"] = train_data["commentText"].apply(pre_process)

In [17]:
# Clean every test comment in place with pre_process().
test_data["commentText"] = test_data["commentText"].apply(pre_process)

In [18]:
# Separate the target column from the remaining columns.
# NOTE(review): neither `labels` nor `train_data_1` is used by the later cells visible
# in this notebook, which read train_data["label"] directly — confirm they are needed.
labels = train_data[["label"]]
train_data_1 = train_data.drop("label", axis=1)

<h2>Data Processing</h2>

In [19]:
from inltk.inltk import tokenize
from indicnlp.tokenize import indic_tokenize
from inltk.inltk import identify_language

# iNLTK language code for each dataset language that is tokenised with a
# language-specific model (Hindi is handled separately inside get_tokens).
_INLTK_LANGUAGE_CODES = {
    "Telugu": "te",
    "Marathi": "mr",
    "Tamil": "ta",
    "Malayalam": "ml",
    "Bengali": "bn",
    "Kannada": "kn",
    "Odia": "or",
    "Gujarati": "gu",
}


def get_tokens(string, language):
    """Tokenise one comment with the tokenizer matching its dataset language.

    Hindi comments use the "hi" model when `identify_language` recognises the
    text as Hindi and the "hi-en" model otherwise. The languages listed in
    `_INLTK_LANGUAGE_CODES` use their iNLTK model. Every other language, and any
    comment for which the iNLTK path raises, is tokenised with
    `indic_tokenize.trivial_tokenize`.

    Parameters
    ----------
    string : str
        The comment text.
    language : str
        Value of the dataset's `language` column (e.g. "Hindi", "Telugu").

    Returns
    -------
    The token sequence returned by the selected tokenizer.
    """
    # Bind iNLTK's tokenizer under a private alias. A later notebook cell defines
    # a one-argument function that is also called `tokenize`; once that cell has
    # run, a bare `tokenize(string, code)` here would raise TypeError and every
    # comment would silently take the fallback path.
    from inltk.inltk import tokenize as _inltk_tokenize

    try:
        if language == "Hindi":
            # NOTE(review): the original compared only against "hi". The full name
            # is accepted as well because identify_language's return format is not
            # visible in this notebook — confirm against the iNLTK documentation.
            detected = identify_language(string)
            code = "hi" if detected in ("hi", "hindi") else "hi-en"
            return _inltk_tokenize(string, code)

        code = _INLTK_LANGUAGE_CODES.get(language)
        if code is not None:
            return _inltk_tokenize(string, code)
    except Exception:
        # Deliberate best-effort: any tokenizer failure falls through to the
        # trivial tokenizer below. Unlike the previous bare `except:`, this no
        # longer swallows KeyboardInterrupt / SystemExit.
        pass

    return indic_tokenize.trivial_tokenize(string)

In [20]:
from indicnlp.tokenize import indic_tokenize

def tokenize(string):
    """Split `string` into tokens with Indic NLP's trivial tokenizer.

    Note: defining this function rebinds the notebook-level name `tokenize`,
    which an earlier cell imported from inltk.inltk.
    """
    tokens = indic_tokenize.trivial_tokenize(string)
    return tokens

In [21]:
# Replace each comment with the result of the one-argument tokenize() helper defined
# in the previous cell.
# NOTE(review): the get_tokens() cells below read commentText again and pass it on as
# `string`; presumably only one of the two tokenisation paths is meant to be run.
train_data["commentText"] = train_data["commentText"].apply(tokenize)
test_data["commentText"] = test_data["commentText"].apply(tokenize)

In [None]:
import time

# Time the language-aware tokenisation of the training comments.
# time.clock() was removed in Python 3.8; time.perf_counter() is the supported
# high-resolution clock for measuring elapsed time.
start_time = time.perf_counter()
train_data['commentText'] = train_data.apply(lambda x: get_tokens(x.commentText, x.language), axis=1)
end_time = time.perf_counter()

# Elapsed seconds.
print(end_time-start_time)

In [None]:
# Language-aware tokenisation of the test comments (same call as for the training set).
test_data['commentText'] = test_data.apply(lambda x: get_tokens(x.commentText, x.language), axis=1)

In [None]:
import re
def _strip_subword_marker(tokens):
    """Remove the "▁" marker from every token.

    The tokenisation cells store a list of tokens per comment, and re.sub()
    raises TypeError when handed a list, so the marker is removed token by
    token. A plain string is also accepted and cleaned directly.
    """
    if isinstance(tokens, str):
        return tokens.replace("▁", "")
    return [token.replace("▁", "") for token in tokens]


train_data["commentText"] = train_data["commentText"].apply(_strip_subword_marker)
test_data["commentText"] = test_data["commentText"].apply(_strip_subword_marker)

In [None]:
import stanza

# Build an English stanza pipeline restricted to the tokenize, mwt, pos and lemma processors.
# The two lines that would apply it to the comments are left disabled below.
nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
# train_data["commentText"] = train_data["commentText"].apply(lambda x: nlp_stanza(x))
# test_data["commentText"] = test_data["commentText"].apply(lambda x: nlp_stanza(x))

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

def create_vector(train_data, test_data, token_pattern=r"(?u)\b\w\w+\b"):
    """Turn two text collections into TF-IDF matrices sharing one vocabulary.

    The vectorizer is fitted on `train_data` only and then applied to
    `test_data`, so both matrices have the same columns. Unigrams and bigrams
    are used, English stop words are removed, and case is preserved.

    Parameters
    ----------
    train_data : iterable of str
        Documents used to fit the vocabulary and IDF weights.
    test_data : iterable of str
        Documents transformed with the fitted vectorizer.
    token_pattern : str, optional
        Regular expression defining a token. The default is the pattern the
        vectorizer used before this parameter existed, so results are unchanged.
        NOTE(review): with this default, `\\w` does not match combining vowel
        signs, so words in Indic scripts are split at those characters. For
        text that is already space-tokenised, `r"(?u)\\S+"` keeps each
        whitespace-separated token whole — consider switching after
        re-validating the models.

    Returns
    -------
    (train_vector, test_vector)
        The TF-IDF matrices for the two inputs.
    """
    # min_df=1 keeps every term, exactly like the previous integer 0 did, but is
    # inside the range scikit-learn validates for integer min_df (>= 1).
    tfidf_vectorizer = TfidfVectorizer(
        norm='l2',
        lowercase=False,
        min_df=1,
        stop_words='english',
        use_idf=True,
        smooth_idf=False,
        sublinear_tf=True,
        ngram_range=(1, 2),
        token_pattern=token_pattern,
    )

    # Fit on the training documents only, to avoid leaking test vocabulary.
    train_vector = tfidf_vectorizer.fit_transform(train_data)
    test_vector = tfidf_vectorizer.transform(test_data)

    return train_vector, test_vector

In [24]:
# Join each comment's tokens back into one space-separated string, the form that is
# later passed to create_vector().
train_data["commentText"] = train_data["commentText"].apply(lambda x: " ".join(x))
test_data["commentText"] = test_data["commentText"].apply(lambda x: " ".join(x))

In [None]:
# Plotting and word-cloud dependencies
import matplotlib.pyplot as mtp
%matplotlib inline
import wordcloud
from wordcloud import WordCloud, STOPWORDS

# Create a wordcloud from all training comments, joined into one corpus string
title_corpus = ' '.join(train_data["commentText"])
title_wordcloud = WordCloud(stopwords=STOPWORDS, background_color='black', height=2000, width=4000).generate(title_corpus)

# Plot the wordcloud without axes
mtp.figure(figsize=(16,8))
mtp.imshow(title_wordcloud)
mtp.axis('off')
mtp.show()

In [25]:
# Vectorise the joined comment strings; the vocabulary is fitted on the training comments.
train_vector, test_vector = create_vector(train_data["commentText"], test_data["commentText"])

<h2>Logistic Regression</h2>

In [26]:
from sklearn.linear_model import LogisticRegression

# Earlier configuration, kept for reference:
# logreg = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)

# Fit the liblinear logistic regression on the training vectors (fit() returns the
# fitted estimator), then predict labels for the test vectors.
logreg = LogisticRegression(
    C=2,
    dual=False,
    solver='liblinear',
    max_iter=10000,
).fit(train_vector, train_data["label"])
y_pred = logreg.predict(test_vector)

In [28]:
# Save the logistic-regression predictions.
# NOTE: write_to_file() is defined in the "Saving to file" cell near the end of this
# notebook; that cell must be executed before this one.
write_to_file(y_pred)

In [None]:
from sklearn.linear_model import LogisticRegression
# 10-fold cross-validated accuracy of a class-balanced lbfgs logistic regression.
# NOTE: Cross_Validation() is defined in the "Cross Validation Method" cell further
# down; that cell must be executed before this one.
model = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)

Cross_Validation(10, model, train_vector, train_data["label"] )

<h2>Random Forest Classifier</h2>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters.
# A fixed random_state makes repeated runs produce the same forest and therefore
# the same predictions; the value matches the seed used by Cross_Validation's KFold.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(train_vector, train_data["label"])
y_pred = rfc.predict(test_vector)

In [None]:
# Save the random-forest predictions.
# NOTE: write_to_file() is defined in the "Saving to file" cell near the end of this
# notebook; that cell must be executed before this one.
write_to_file(y_pred)

<h2>K-Nearest Neighbor</h2>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours with default settings: fit on the training vectors (fit()
# returns the fitted estimator), then predict labels for the test vectors.
knn = KNeighborsClassifier().fit(train_vector, train_data["label"])
y_pred = knn.predict(test_vector)

In [None]:
# Save the KNN predictions.
# NOTE(review): the trailing comment names submission4.csv, but write_to_file() as
# defined in this notebook writes to a hard-coded .../submission7.csv, so each call
# overwrites the same file — confirm the intended file name.
write_to_file(y_pred) #submission4.csv

<h2>Grid Search CV</h2>

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Fresh, unfitted estimators (`knn` is created here but not used by the search below).
knn = KNeighborsClassifier()
rfc = RandomForestClassifier()

# Hyper-parameter grid for the random forest.
# max_features starts at 1: 0 is not a valid number of features per split, so the
# previous range(0, 14) produced candidates that could not be fitted.
forest_params = [{'max_depth': list(range(10, 15)), 'max_features': list(range(1, 14))}]

# Exhaustive search over the grid, scored by 10-fold cross-validated accuracy.
clf = GridSearchCV(rfc, forest_params, cv = 10, scoring='accuracy')

clf.fit(train_vector, train_data["label"])
 

<h2>Cross Validation Method</h2>

In [None]:
# Cross Validation (Tested Features)

from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

def Cross_Validation(n_splits, model, X, y ):
    """Return the mean accuracy of `model` over a shuffled K-fold split of (X, y).

    Parameters
    ----------
    n_splits : int
        Number of folds.
    model : estimator
        Estimator passed to cross_val_score.
    X, y :
        Feature matrix and target values.

    Returns
    -------
    The mean of the per-fold accuracy scores.
    """
    # Shuffle with a fixed seed so the fold assignment is repeatable.
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    fold_accuracies = cross_val_score(model, X, y, cv=folds, scoring='accuracy')
    return np.mean(fold_accuracies)


# cv = KFold(n_splits=10, random_state=1, shuffle=True)
# model = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)
# scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(_train_data, train_labels, test_size=0.33)

<h2>Saving to file</h2>

In [None]:
#Method to write to file
import csv 

def write_to_file(
    predictions,
    filename="/Users/krishna/workspace/python/CS584/Multilingual_Abusive_Comment_Identification/submission7.csv",
    first_id=2,
):
    """Write predictions to a two-column submission CSV.

    The file gets a header row `Id,Expected`, followed by one row per prediction.
    Ids are consecutive integers starting at `first_id`.

    Parameters
    ----------
    predictions : iterable
        Predicted labels, in test-set row order.
    filename : str, optional
        Destination path; an existing file is overwritten. The default is the
        path that was previously hard-coded.
        NOTE(review): that default lives under /Users/krishna, while the data is
        read from /Users/sudheera elsewhere in this notebook — confirm the
        intended location, or pass `filename` explicitly.
    first_id : int, optional
        Id written for the first prediction. The default 2 reproduces the
        previous `index + 2` numbering.
    """
    # field names
    fields = ['Id', 'Expected']

    # data rows of csv file: [id, prediction]
    rows = [[row_id, value] for row_id, value in enumerate(predictions, start=first_id)]

    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; without it extra blank lines appear on Windows.
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        csvwriter.writerows(rows)

In [None]:
# train_data[train_data["language"]=="Hindi"].iloc[0]["commentText"]

In [None]:
# train_labels = train_data.label
# # train_data = train_data.iloc[:,1:-2]
# _train_data = train_data.commentText

In [None]:
# from emot.emo_unicode import EMOTICONS_EMO
# import emoji

# def strip_emoji(string):
#     return re.sub(emoji.get_emoji_regexp(), r"", string)
# def remove_emoji(string):
#     emoji_pattern = re.compile("["
#                            u"\U0001F600-\U0001F64F"  # emoticons
#                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            u"\U00002702-\U000027B0"
#                            u"\U000024C2-\U0001F251"
#                            "]+", flags=re.UNICODE)
#     return emoji_pattern.sub(r'', string)

# def remove_emoticons(text):
#     emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS_EMO) + u')')
#     return emoticon_pattern.sub(r'', text)

In [None]:
# from inltk.inltk import get_embedding_vectors

# # get embedding for input words
# vectors = get_embedding_vectors("विश्लेषिकी विद्या", "hi")

# print(vectors)
# # print shape of the first word
# print("shape:", vectors[0].shape)

In [None]:
### Language Detection ###

# from textblob import TextBlob

# lang = TextBlob("Free fire pubg ka baap ha kutta sala kamina") 
# print(lang.detect_language())


# from langdetect import detect

# detect("Saale Tu kon sa pagal Nahi h")

In [None]:
### Transliterate ###

from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
# Experiment 1: run the Unicode transliterator on a Latin-script sentence with both
# the source and the target code set to "hi".
input_text = "Free fire pubg ka baap ha kutta sala kamina"
print(UnicodeIndicTransliterator.transliterate(input_text,"hi","hi"))


# Experiment 2: the same sentence through indic_transliteration, from ITRANS to Devanagari.
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
y = "Free fire pubg ka baap ha kutta sala kamina"
print(transliterate(y, sanscript.ITRANS, sanscript.DEVANAGARI))

# Experiment 3: Devanagari text through the Unicode transliterator with target code "eng".
# NOTE(review): the first import below repeats the one at the top of this cell, and
# LatinToIndicAcronymTransliterator is not used in this cell. Also confirm that "eng"
# is a language code this transliterator accepts.
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from indicnlp.transliterate.acronym_transliterator import LatinToIndicAcronymTransliterator
x = "शायद योगी जी है"
print(UnicodeIndicTransliterator.transliterate(x,"hi","eng"))



In [None]:
### Tokenize ###

# Experiment 1: trivial tokenizer on a Latin-script sentence.
from indicnlp.tokenize import indic_tokenize
hindi_text = "Free fire pubg ka baap ha kutta sala kamina"
print(indic_tokenize.trivial_tokenize(hindi_text))


# from textblob import TextBlob
# from inltk.inltk import tokenize

# def get_tokens(string, language):
#     if language == "Hindi":
#         tokens = tokenize(string, "hi")
#         return tokens

# Experiment 2: iNLTK tokenizer on Devanagari text.
# Note: this import rebinds the notebook-level name `tokenize` to iNLTK's function,
# replacing the one-argument helper defined earlier in the notebook.
from inltk.inltk import tokenize

# This multi-line sample is overwritten by the assignment that follows it, so only
# the short sentence below is actually tokenised.
hindi_text = """प्राचीन काल में विक्रमादित्य नाम के एक आदर्श राजा हुआ करते थे।
अपने साहस, पराक्रम और शौर्य के लिए  राजा विक्रम मशहूर थे। 
ऐसा भी कहा जाता है कि राजा विक्रम अपनी प्राजा के जीवन के दुख दर्द जानने के लिए रात्री के पहर में भेष बदल कर नगर में घूमते थे।"""

hindi_text = "फ्री फायर पब का बाप हा कुट्टा साला कमीना"

# tokenize(input text, language code)
tokenize(hindi_text, "hi")
