In [1]:
!cp drive/My\ Drive/natural-language-processing/Comments_Data.zip .
!unzip Comments_Data.zip
!cp drive/My\ Drive/natural-language-processing/stopwords-farsi.txt .

Archive:  Comments_Data.zip
  inflating: test_nolabel_comments.csv  
  inflating: train_comments.csv      


### Loading Data
- I am going to read only 10000 rows because of low ram issues

In [2]:
import pandas as pd
pd.options.display.max_colwidth = 100

In [3]:
df = pd.read_csv('train_comments.csv', index_col='id', usecols=['id','title','comment','verification_status'],nrows=10000)
df.head()

Unnamed: 0_level_0,title,comment,verification_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,نسبت به قیمتش خوبه ولی یه کم کوچیکه,کوچیکه - کلید خاموش و روشنش مسخره اس,1
1,شومیز,خیلی بد فرمه انگار مانتوی بارداریه. اصلا با عکسی که گذاشتن مطابقت نداره,0
2,جنس بسیار ضعیفه,پیشنهادم اینه ک کسی نخره چون پولشو میندازه دور,0
3,رضایت از خرید,طعمش از بقیه کاپوچینو های فوری بهتره,0
4,ارسال جنس تقلبی,برای من چهار عدد ارسال شد که یکی از کارتن ها باز شده بود و شمع تقلبی داخلش گذاشته بودن و با چسب ...,1


In [4]:
df.describe()

Unnamed: 0,verification_status
count,10000.0
mean,0.1721
std,0.377486
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


### Data Cleaning
- Make text all lower case (Normalization)
- Punctuation Removal
- Numerical Value Removal
- Stop-words Removal
- Tokenization

In [4]:
df.comment.loc[0]

'کوچیکه - کلید خاموش و روشنش مسخره اس'

In [5]:
import re
import string

def clean_text(text):
  text = text.lower()
  text = re.sub('[.*؟،?\/]', '', text)  # remove .*?
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
  text = re.sub('\w*\d\w*', '', text) # remove any number surrounded with a string
  return text

df.comment = df.comment.apply(lambda x: clean_text(x))
df.head(5)

Unnamed: 0_level_0,title,comment,verification_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,نسبت به قیمتش خوبه ولی یه کم کوچیکه,کوچیکه کلید خاموش و روشنش مسخره اس,1
1,شومیز,خیلی بد فرمه انگار مانتوی بارداریه اصلا با عکسی که گذاشتن مطابقت نداره,0
2,جنس بسیار ضعیفه,پیشنهادم اینه ک کسی نخره چون پولشو میندازه دور,0
3,رضایت از خرید,طعمش از بقیه کاپوچینو های فوری بهتره,0
4,ارسال جنس تقلبی,برای من چهار عدد ارسال شد که یکی از کارتن ها باز شده بود و شمع تقلبی داخلش گذاشته بودن و با چسب ...,1


### More Data Cleaning

In [6]:
def clean_text_more(text):
  text = re.sub('[''""...]', '', text)
  text = re.sub('\n', '', text)
  text = re.sub('\r', '', text)
  text = re.sub('[...]','',text).replace('\u200c', '')
  return text

df.comment = df.comment.apply(lambda x: clean_text_more(x))
df['comment'].head(50)

id
0                                                                     کوچیکه  کلید خاموش و روشنش مسخره اس
1                                  خیلی بد فرمه انگار مانتوی بارداریه اصلا با عکسی که گذاشتن مطابقت نداره
2                                                          پیشنهادم اینه ک کسی نخره چون پولشو میندازه دور
3                                                                    طعمش از بقیه کاپوچینو های فوری بهتره
4     برای من چهار عدد ارسال شد که یکی از کارتن ها باز شده بود و شمع تقلبی داخلش گذاشته بودن و با چسب ...
5     به نظر من نباید عکس  با خود جنس مقایرت داشته باشه  میشه هم طوری عکاسی بکنید که  به اشتباه نیفتند...
6     بیش از  ماه پیش  این سیستم را با توجه به نضرات کاربران برای بچه ها که در سالهای آخر دبیرستان هست...
7         میشه سایز  رو دوباره موجود کنیدمن دیر رسیدم به پیشنهاد شگفت انگیزشکاش دوباره شگفت انگیز بزاریدش
8                                                                                               راضی بودم
9                                          

In [7]:
re.sub('[...]','','من این ساعت رو خریداری کردم و بعد از تحویل در کمال شگفتی متوجه شدم که روی جعبه مشخصات ساعت دیگه ای نوشته شده که با ساعتی که ...').replace('\u200c', '')


'من این ساعت رو خریداری کردم و بعد از تحویل در کمال شگفتی متوجه شدم که روی جعبه مشخصات ساعت دیگه ای نوشته شده که با ساعتی که '

### Organizing the data
- Corpus which is already ready
- Document-Term matrix

In [8]:
df.to_pickle('corpus.pkl')

#### Document-Term Matrix

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.externals import joblib ## can use joblib for better performance

### Set a custom persian stop words
stop_words = open('stopwords-farsi.txt').readlines()
stop_words_to_list = [word.strip() for word in stop_words]

cv = TfidfVectorizer(stop_words=stop_words_to_list)
# vec.fit(df.comment)
# joblib.dump(vec, 'vec_count.joblib')
# vec = joblib.load('vec_count.joblib')
# df_cv = vec.transform(df.comment)
df_cv = cv.fit_transform(df.comment)
df_dtm = pd.DataFrame(df_cv.toarray(),columns=cv.get_feature_names())
df_dtm.index = df.index
df_dtm.tail()

Unnamed: 0_level_0,aali,ac,acer,actyon,adata,adataتا,adsl,after,air,airdots,airdotspro,aks,al,ali,alt,amoled,anc,anker,antutu,ar,arc,armor,aspire,asus,auto,aux,avalesh,awlie,az,aتا,bad,bag,band,bar,bb,beats,bekharin,benchmark,bi,bkخریدم,...,یکساعت,یکسال,یکسالزنجیراشم,یکساله,یکسالو,یکسالی,یکسان,یکسانه,یکسره,یکسری,یکش,یکطرفش,یکطرفه,یکم,یکماه,یکماهه,یکماهی,یکمرتبه,یکمش,یکمقدار,یکمم,یکمواسه,یکمی,یکمیل,یکنم,یکنواخت,یکنواخته,یکنواختی,یکنی,یکه,یکهبا,یکهفته,یککیسه,یکیش,یکیشو,یکیشون,یکیه,یگیش,یی,یییییییییییییییییییر
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
