In [1]:
import pandas as pd
import hazm
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor

from tqdm import tqdm
import re

In [2]:
# Load the raw listings and discard the index columns that pandas labels
# "Unnamed: <n>" when a CSV was written with its index but no header for it.
df = pd.read_csv("mobile_phone_dataset.csv")
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
# Keep only the part of `brand` that precedes the first '::' separator.
# The vectorised `.str` accessor avoids one Python call per row and leaves a
# missing brand as NaN instead of raising AttributeError on it.
df['brand'] = df['brand'].str.split('::', n=1).str[0]

In [15]:
# Stop-word list provided by hazm; the preprocessing functions defined in this
# cell drop every whitespace-separated token that appears in it.
stopwords = hazm.utils.stopwords_list()
# preprocessing item name
def preprocess_name(title_Col):
    """Turn every item title into a list of tokens.

    For each title the whitespace-separated words found in the module-level
    ``stopwords`` list are dropped, the remaining words are lower-cased and
    the result is tokenised with ``hazm.WordTokenizer``.

    Parameters
    ----------
    title_Col : object exposing ``.values``
        Column of titles (e.g. a pandas Series). Entries that are not
        strings (missing values) produce an empty token list.

    Returns
    -------
    list of list of str
        One token list per input entry, in input order.
    """
    tokenizer = hazm.WordTokenizer()
    # A set makes each membership test O(1); `stopwords` itself is a list.
    stopword_set = frozenset(stopwords)
    preprocessed_names = []
    for sentence in tqdm(title_Col.values):
        if not isinstance(sentence, str):
            # NaN / None title: nothing to tokenise.
            preprocessed_names.append([])
            continue
        # str.split() already removes surrounding whitespace, so no strip().
        kept_words = [word.lower() for word in sentence.split()
                      if word not in stopword_set]
        preprocessed_names.append(tokenizer.tokenize(' '.join(kept_words)))
    return preprocessed_names

# preprocessing item description
def preprocess_desc(desc_col):
    """Turn every item description into a list of tokens.

    Pipeline per description:

    1. replace ``/`` with a space and delete ``.``;
    2. normalise the text with ``hazm.Normalizer``;
    3. drop whitespace-separated words found in the module-level
       ``stopwords`` list and lower-case the rest;
    4. tokenise with ``hazm.WordTokenizer``.

    Stop words are filtered *after* punctuation removal and normalisation so
    that a stop word followed by a period, or written in a form the
    normaliser rewrites, is still recognised and removed.

    Parameters
    ----------
    desc_col : object exposing ``.values``
        Column of descriptions (e.g. a pandas Series). Entries that are not
        strings (missing values) produce an empty token list.

    Returns
    -------
    list of list of str
        One token list per input entry, in input order.
    """
    tokenizer = hazm.WordTokenizer()
    normalizer = hazm.Normalizer()
    # A set makes each membership test O(1); `stopwords` itself is a list.
    stopword_set = frozenset(stopwords)
    preprocessed_descs = []
    for sentence in tqdm(desc_col.values):
        if not isinstance(sentence, str):
            # NaN / None description: nothing to tokenise.
            preprocessed_descs.append([])
            continue
        text = sentence.replace('/', ' ').replace('.', '')
        text = normalizer.normalize(text)
        kept_words = [word.lower() for word in text.split()
                      if word not in stopword_set]
        preprocessed_descs.append(tokenizer.tokenize(' '.join(kept_words)))
    return preprocessed_descs

# Tokenise the title and description columns and store the token lists next
# to the raw text (titles are processed first, then descriptions).
df = df.assign(
    preprocessed_names=preprocess_name(df['title']),
    preprocessed_descs=preprocess_desc(df['desc']),
)
# Show the tokenised descriptions as the cell output.
df['preprocessed_descs']

100%|██████████| 59189/59189 [00:01<00:00, 36474.57it/s]
100%|██████████| 59189/59189 [00:09<00:00, 6054.60it/s]


0        [سلامیه, گوشیه, ۶۳۰۳, سالم, دوتا, خط, کوچیک, ا...
1            [درحد, نو, سالم, اصلى, ضربه, مهلت, تست, میدم]
2                          [گوشى, تمیز, هفته, کارکرده, آک]
3        [گلس, کارت, اپل, ای, دی, لوازم, جانبی, اصلی, م...
4               [تمیز, ۱, خط, خش, همراه, گلاس, قاب, محافظ]
                               ...                        
59184    [گوشی, موبایل, ایفون, ۴s, ۳۲, گیگ, سفید, سالم,...
59185    [گوشی, ال, جی, سالم, وبدون, کوچکترین, ایرادوبد...
59186          [جعبه, ولوازم, llaاصل, خط, وخش, نشده, سالم]
59187    [سالم, تمیز, لوازم, اندروید, ۵, باتری, اورجینا...
59188    [فروش, معاوضه, فوری, فوری, سفید, تمیز, همراه, ...
Name: preprocessed_descs, Length: 59189, dtype: object

In [26]:
# Keep only listings with a strictly positive price. The reported count is
# derived from the same mask that does the filtering, so it also covers rows
# whose price is missing (NaN compares False against both `<= 0` and `> 0`).
has_valid_price = df.price > 0
print('Removed {} rows' .format(len(df) - int(has_valid_price.sum())))
df = df[has_valid_price].reset_index(drop=True)

Removed 5888 rows


In [27]:
# Regression target is log(1 + price); `pop` removes the column from the
# feature frame in the same step that reads it.
y_df = np.log1p(df.pop('price'))

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    df, y_df, random_state=10, test_size=0.2
)
print('DF size: {}, Train size: {}, Test size: {}'.format(df.shape, x_train.shape, x_test.shape))

DF size: (53301, 6), Train size: (42640, 6), Test size: (10661, 6)


In [29]:
# Binary bag-of-words encoding of the `brand` column (case is preserved).
# NOTE(review): CountVectorizer tokenises with its default token pattern, so a
# multi-word brand activates several columns - confirm that is intended.
vectorizer = CountVectorizer(lowercase=False, binary=True)
# Learn the vocabulary from the training split only, so brands that occur
# solely in the held-out rows cannot shape the feature space (no leakage).
vectorizer.fit(x_train['brand'].values)
# NOTE(review): despite its name, `train_brand_oneHot` has one row per row of
# the full `df` (train and test together); `cv_brand_oneHot` holds the x_train
# rows. Names and row counts are kept as they were for downstream cells.
train_brand_oneHot = vectorizer.transform(df['brand'].values)
cv_brand_oneHot = vectorizer.transform(x_train['brand'].values)
test_brand_oneHot = vectorizer.transform(x_test['brand'].values)