# Sentiment analysis for Hindi/English code-mixed text.
<hr/>

### This file cleans the data, pre-processes it and creates a new csv that can be used to train the models.

In [15]:
# All the main imports
import pandas as pd
import numpy as np
import re
import json


# All the nltk imports
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize


In [16]:
# Read the appropriate dataset file
df = pd.read_csv('train_data.csv')

In [17]:
print('The data distribution between different labels: ')
df['Label'].value_counts()


The data distribution between different labels: 


neutral     5264
positive    4634
negative    4102
Name: Label, dtype: int64

In [4]:
print('Check if the any null data')
df.isnull().value_counts()

Check if the any null data


id     Sentence  Label
False  False     False    3000
dtype: int64

In [5]:
# Removing the null data
df = df[df['Sentence'].isnull() == False]
df = df[df['Label'].isnull() == False]

In [6]:
# Validate if all the null data is removed
df.isnull().value_counts()

id     Sentence  Label
False  False     False    3000
dtype: int64

### Removing all the languages except Hin & Eng as those token will not be useful in the training process

Words with `O` Language tag are special characters
Words with `EMT` Language tag are emojis
Words with `positive, negative, neutral` Language tag are miss labels.

Removing all these dataframe

In [7]:
# The summary of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3000 non-null   int64 
 1   Sentence  3000 non-null   object
 2   Label     3000 non-null   object
dtypes: int64(1), object(2)
memory usage: 93.8+ KB


### We will be performing following tasks on the data

1. Make all the data to lower case
2. Stemming the data
3. Lemmatizing the data
4. Removing stopwords
5. Removing the usernames 'Words with digits'

In [8]:
# Task1 : Turning the data to lower case
df['Sentence'] = df['Sentence'].str.lower()

In [9]:
# Task2 : Stemming
##  Define function to perform stemming on words

stemmer = PorterStemmer()

# def perform_stemming(token):
#     if token['Language'] == 'Hin':
#         perform_hin_stemming(token['Words'])
#     else:
#         perform_eng_stemming(token['Words'])

def perform_stemming(text):
    tokens = word_tokenize(text)
    stemmed_words = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_words)

def perform_eng_stemming(word):
    stemmed_word = stemmer.stem(word)
    return stemmed_word

def perform_hin_stemming(word):
    stemmed_word = re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word)
    return stemmed_word

## Perform stemming based on if the Language assigned in hindi or english
# df['Words'] = df['Words'].apply(lambda token: perform_stemming(token), axis=1)
df['Sentence'] = df['Sentence'].apply(perform_stemming)



In [10]:
# Task 3 : lemmatization

# Define function to perform lemmatization on words
# def perform_lemmatization(word):
#     lemmatized_words = lemmatizer.lemmatize(word, pos=wordnet.VERB)
#     return lemmatized_words

lemmatizer = WordNetLemmatizer()
def perform_lemmatization(text):
    tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(token, pos=wordnet.VERB) for token in tokens]
    return ' '.join(lemmatized_words)


df['Sentence'] = df['Sentence'].apply(perform_lemmatization)


In [11]:
# Task 4 : Remove stopwords

stop_words_set_eng = set(stopwords.words('english'))
stop_words_set_hin = set([
    'is', 'ke', 'ka', 'ek', 'hai', 'hain', 'ki', 'ko', 'mein', 'se', 'par', 'bhi', 'ke', 'liye', 'saath',
    'ho', 'kar', 'vale', 'vali', 'kuch', 'jo', 'to', 'hi', 'tak', 'ya', 'hote', 'hota', 'tha', 'the',
    'ab', 'jab', 'kahaa', 'kisi', 'ne', 'unke', 'uske', 'uski', 'usmein', 'uskoe', 'usse', 'iskay',
    'iski', 'ismein', 'iskoe', 'isse', 'tab', 'phir', 'jaise', 'jiske', 'jiskee', 'jismein', 'jiskoe',
    'jisse', 'yah', 'yahee', 'ye', 'vah', 'vahee', 've', 'kai', 'kul', 'door', 'parantu', 'aap', 'tum',
    'tumhara', 'tumhare', 'main', 'mera', 'mere', 'ham', 'hamara', 'hamare', 'apna', 'apne', 'khud',
    'yahan', 'vahan', 'sabka', 'sabke', 'kisi', 'kise', 'sabhi', 'sab', 'koi', 'kuch', 'kisi',
    'kisi', 'kisi', 'koi', 'dusra', 'any', 'any', 'aur', 'etc'
])

stop_words_set = stop_words_set_eng | stop_words_set_hin

# def check_if_stopwords(word):
#     return word not in stop_words_set


# # df['Words'] = df['Words'].apply(lambda token: check_if_stopwords(token))
# for i in stop_words_set:
#     df = df[df['Words'] != i]

def remove_stopwords(text):
    filtered_tokens = []
    tokens = word_tokenize(text)
    for t in tokens:
        if t not in stop_words_set:
            filtered_tokens.append(t)
    return ' '.join(filtered_tokens)

df['Sentence']= df['Sentence'].apply(remove_stopwords)

In [12]:
# Task 5: Remove usernames (Words with digits in it.)

# df['alpha'] = df['Words'].str.contains(r'\d', regex=True)
# df = df[df['alpha'] != True]

def remove_num(text):
    filtered_tokens = []
    tokens = word_tokenize(text)
    for t in tokens:
        if re.search(r'\d',t):
            continue;
        else:
            filtered_tokens.append(t)
    return ' '.join(filtered_tokens)

def remove_space(tokens):
    return tokens.strip()

df['Sentence']= df['Sentence'].apply(remove_num)
df['Sentence']= df['Sentence'].apply(remove_space)

In [13]:
df.info

<bound method DataFrame.info of          id                                           Sentence     Label
0     30258  prahladspatel modi mantrimand may samil honay ...  positive
1     16648  bkunalraj tajinderbagga jammupalchhin shehla r...  negative
2     28511  waglenikhil u saw cast religion nation saw tal...  negative
3     10466  delhipolic sir local polic station pe complain...   neutral
4     19266  maahi song kesari current favourit music melod...  positive
...     ...                                                ...       ...
2995  16859  rt mukeshsharmamla khushi nahi nayi sarkaar aa...  negative
2996   2294  music life thank chhote ustad salman ali post ...   neutral
2997  29819  vicki gilmour hmmmm realli sam outlaw someth a...   neutral
2998  34181  rssurjewala incindia gala faad nahi chillana c...  negative
2999  36603  lerki allah swt beha may ku lati chor diffah r...   neutral

[3000 rows x 3 columns]>

In [14]:
# Drop the extra columns
df.to_csv('data/final_data/validate_data_clean.csv', index=False)