## Import the necessary libraries

In [1]:
import pandas as pd
import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import re
from langdetect import detect

In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 1000)

In [3]:
def keep_alpha_num(x):
    s = re.sub(r'[.,"\'-_?:!#;]', '', x)
    s = re.sub(r"[\([{})\]]", "", s)
    s = ' '.join(s.split())
    return s

## Start the Data Ingestion Stage

In [4]:
file_path = 'data/Coding_Challenge_NLP/training.csv'

df = pd.read_csv(file_path, header=0, names=['Source_Id', 'Source', 'Sentiment', 'Feedback'])

## Data Exploration and Preprocessing Stage

This exploratory phase is the stage where you are graphing things, testing things on small sets of the data, summarizing simple statistics, and getting rough ideas of what hypotheses you might want to pursue further.

In [5]:
df.Sentiment.value_counts()

Sentiment
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

1. The dependent variable seem to be well - distributed, so there is no need for any Imbalance adjustment.
2. Let us also look for possible na values, if we should in anyway deal with them.


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Source_Id  74681 non-null  int64 
 1   Source     74681 non-null  object
 2   Sentiment  74681 non-null  object
 3   Feedback   73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [7]:
df.isna().sum()
df.dropna(inplace=True)

In [8]:
print(df['Feedback'][59])

 . . [  


1. WE have feedback like the one presented above that do not contain any string character that makes sense to language models that we shall be using for training Purposes. Although they are not empty, they have no relevance and should be removed before feeding to the model for training.

2. Also, we should first remove all the non-numeric and non-alphabetic characters.

3. Remove Duplicates, possibly also very similar sentences

4. We shall also look if all the feedback belongs to the same language or different ones

In [9]:
df['clean_feedback'] = df['Feedback'].apply(lambda x: keep_alpha_num(x))
df['len_feedback'] = df['clean_feedback'].apply(lambda x: len(str(x)))
df = df[df['len_feedback']>10]
df.drop_duplicates(subset=['clean_feedback'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [10]:
import multiprocessing as mp
pool = mp.Pool(mp.cpu_count())
x= df['clean_feedback'].tolist()
results = pool.map(detect, x)
pool.close()
df['lang'] = results
print(df['lang'].value_counts())

lang
en    60842
nl      355
af      337
da      263
no      249
fr      219
cy      208
so      185
it      176
sv      165
et      149
ca      136
tl      120
es       99
ro       83
id       72
de       63
pt       46
fi       44
pl       41
sk       34
hu       30
sw       25
sq       23
tr       23
sl       23
hr       16
cs       16
ru       13
lt        8
lv        6
bg        5
vi        3
mk        2
th        2
Name: count, dtype: int64


### Here we see that there are other languages also present in the dataset other than English, so we must account for them as well, while Training the model

## Model Training Phase




In [13]:
file_path = 'data/Coding_Challenge_NLP/validation.csv'

df = pd.read_csv(file_path, header=0, names=['Source_Id', 'Source', 'Sentiment', 'Feedback'])