In [44]:
# BERT REF : https://github.com/hooshvare/parsbert#parsbert-v20-sentiment-analysis
!pip install -q hazm
!pip install -q clean-text[gpl]
!pip install plotly

Collecting plotly
  Downloading plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11429 sha256=c2e9e88792ee15cce47703a80c4f973f4419ad823f8a3d4378c5d0333d28655b
  Stored in directory: c:\users\ali\appdata\local\pip\cache\wheels\ac\cb\8a\b27bf6323e2f4c462dcbf77d70b7c5e7868a7fbe12871770cf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.14.3 retrying-1.3.3


## Import

In [73]:
DATA_PATH = '../Dataset/DeepSentiPers-original.csv'
import pandas as pd
import hazm
import csv, re, pickle

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split

In [1]:
def print_duplicate_by_column(df, col_name):
    ids = df[col_name]
    dup_df = df[ids.isin(ids[ids.duplicated()])].sort_values(col_name)
    pd.set_option('display.max_rows', df.shape[0]+1)
    return dup_df

# Load and analyze data

In [3]:
df = pd.read_csv(DATA_PATH,names =['comment', 'sentiment'], encoding='utf-8')
df.head()

Unnamed: 0,comment,sentiment
0,گوشي خوبيه(قوي و شکيل و زيبا و بي رقيب)البته ت...,1
1,سلام خيلي خوبه بخرين.,2
2,از جمله قابلیت‌های ارتباطی HTC Desire SV می‌تو...,0
3,نهایتا، یک دوربین VGA نیز برای انجام مکالمات ...,0
4,من حدوداً ۱ ماهي‌ که مي‌شه اين گوشي رو دارم، ر...,1


### Check None values

In [5]:

# print data information
print('data information')
print(df.info(), '\n')

# print missing values information
print('missing values stats')
print(df.isnull().sum(), '\n')


data information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5561 entries, 0 to 5560
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    5561 non-null   object
 1   sentiment  5561 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 87.0+ KB
None 

missing values stats
comment      0
sentiment    0
dtype: int64 



## Check duplicates

In [6]:
print_duplicate_by_column(df,'comment')

Unnamed: 0,comment,sentiment
57,"""محدوديت بزرگترين کابوس بشر است!",0
5231,"""محدوديت بزرگترين کابوس بشر است!",0
1030,( با بهره گيري از مک.,0
1704,( با بهره گيري از مک.,0
3282,(البته الان با وجود برنامه سيب اين کار راحتتر ...,1
4615,(البته الان با وجود برنامه سيب اين کار راحتتر ...,0
3009,)بگذريم.,0
2520,)بگذريم.,0
4017,***** آپديت ***** سي و يک شهريور : پردازنده ي ...,0
1329,***** آپديت ***** سي و يک شهريور : پردازنده ي ...,1


As you can see, there are some duplicates in the comment column. Some of them have different sentiment values and some have the same sentiment.
To prevent confusing our model we remove unvalid rows that have same comment with different sentiments and also will deduplicate the data rows

In [16]:
#remove duplicate rows
df = df.drop_duplicates(subset = None, keep='first')

#remove unvalid rows
df = df.drop_duplicates(subset='comment')

In [20]:
print_duplicate_by_column(df,'comment')

Unnamed: 0,comment,sentiment


In [21]:
print('data information')
print(df.info(), '\n')

data information
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5359 entries, 0 to 5560
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    5359 non-null   object
 1   sentiment  5359 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 125.6+ KB
None 



# Clean Text

In [54]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


def CleanPersianText(text):
    _normalizer = hazm.Normalizer()
    #_lemmatizer = hazm.Lemmatizer()
    text = text.strip()
    text = cleanhtml(text)
    text = _normalizer.normalize(text)
    text = re.sub(r'[^a-zA-Z0-9آ-ی۰-۹ ]', ' ', text)
    # removing wierd patterns
    wierd_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)
    
    text = wierd_pattern.sub(r'', text)
    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)
    
    return text







'''

# turn a doc into clean tokens
def clean_doc(doc):
    doc = normalizer.normalize(doc) # Normalize document using Hazm Normalizer
    tokenized = word_tokenize(doc)  # Tokenize text
    tokens = []
    for t in tokenized:
      temp = t
      for p in puncs:
        temp = temp.replace(p, '')
      tokens.append(temp)
    # tokens = [w for w in tokens if not w in stop_set]    # Remove stop words
    tokens = [w for w in tokens if not len(w) <= 1]
    tokens = [w for w in tokens if not w.isdigit()]
    tokens = [lemmatizer.lemmatize(w) for w in tokens] # Lemmatize sentence words using Hazm Lemmatizer
    tokens = ' '.join(tokens)
    return tokens
'''

"\n\n# turn a doc into clean tokens\ndef clean_doc(doc):\n    doc = normalizer.normalize(doc) # Normalize document using Hazm Normalizer\n    tokenized = word_tokenize(doc)  # Tokenize text\n    tokens = []\n    for t in tokenized:\n      temp = t\n      for p in puncs:\n        temp = temp.replace(p, '')\n      tokens.append(temp)\n    # tokens = [w for w in tokens if not w in stop_set]    # Remove stop words\n    tokens = [w for w in tokens if not len(w) <= 1]\n    tokens = [w for w in tokens if not w.isdigit()]\n    tokens = [lemmatizer.lemmatize(w) for w in tokens] # Lemmatize sentence words using Hazm Lemmatizer\n    tokens = ' '.join(tokens)\n    return tokens\n"

In [55]:
df['comment'] = df['comment'].map(CleanPersianText)

# Comment Length

In [56]:
df['comment_len'] =df['comment'].apply(lambda comment: len(hazm.word_tokenize(comment)))
min_max_len = df['comment_len'].min(), df['comment_len'].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

Min: 1 	Max: 335


In [57]:
df.sort_values(by='comment_len')

Unnamed: 0,comment,sentiment,comment_len
2325,مشکل,0,1
2661,سلام,0,1
1219,قیمت,0,1
436,عالی,2,1
1246,عالیه,2,1
2680,عالیه,2,1
5034,GearsOfWar,0,1
5091,سلام,0,1
5111,سلام,0,1
3583,ویدئو,0,1


In [None]:
# remove comments with the length of fewer than 1 word 
df['comment_len'] = df['comment_len'].apply(lambda len_t: len_t if 0 < len_t  else None)
df = df.dropna(subset=['comment_len'])
df = df.reset_index(drop=True)

# Stats

In [80]:
def show_comment_length_distribution(df):
    fig = go.Figure()

    fig.add_trace(go.Histogram(
        x=df['comment_len']
    ))

    fig.update_layout(
        title_text='Comment lenght distribution',
        xaxis_title_text='Word Count',
        yaxis_title_text='Frequency',
        bargap=0.2,
        bargroupgap=0.2)

    fig.show()
show_comment_length_distribution(df)

Majority of comments have lenght of less than 100. So, we can use length of 128 for our language model and drop the rest of words in the sequence.

In [79]:
def show_sentiment_distribution(df):
    fig = go.Figure()

    groupby_sentiment = df.groupby('sentiment')['sentiment'].count()

    fig.add_trace(go.Bar(
        x=list(sorted(groupby_sentiment.index)),
        y=groupby_sentiment.tolist(),
        text=groupby_sentiment.tolist(),
        textposition='auto'
    ))

    fig.update_layout(
        title_text='Sentiment distribution',
        xaxis_title_text='Sentiment',
        yaxis_title_text='Frequency',
        bargap=0.2,
        bargroupgap=0.2)

    fig.show()
show_sentiment_distribution(df)

# HANDLE IMBALANCING

# Train,Validation,Test split

In [74]:
train, test = train_test_split(df, test_size=0.1, random_state=1, stratify=df['sentiment'])
train, valid = train_test_split(train, test_size=0.1, random_state=1, stratify=train['sentiment'])

train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

x_train, y_train = train['comment'].values.tolist(), train['sentiment'].values.tolist()
x_valid, y_valid = valid['comment'].values.tolist(), valid['sentiment'].values.tolist()
x_test, y_test = test['comment'].values.tolist(), test['sentiment'].values.tolist()

print(train.shape)
print(valid.shape)
print(test.shape)

(4503, 3)
(501, 3)
(556, 3)


In [81]:
train.to_csv('../Dataset/train.csv')
test.to_csv('../Dataset/test.csv')
valid.to_csv('../Dataset/valid.csv')