In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from bs4 import BeautifulSoup
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [67]:
#veri setlerini yükleme
df = pd.read_csv('NLPlabeledData.tsv', delimiter="\t", quoting=3)

In [68]:
#verilere bakalım:
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [69]:
len(df)

25000

In [70]:
len(df["review"])

25000

In [71]:
#stopwords'ü temizleme işlemi için stopwords kelime setini indirme işlemi yapıyoruz nltk ile:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
#veri temizleme işlemleri
#öncelikle BeautifulSoup modülü ile HTML taglerini review cümlelerinden silme işlemi:
sample_review = df.review[0]
sample_review


'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [73]:
#HTML tag'lerini temizledikten sonra
sample_review = BeautifulSoup(sample_review).get_text()
sample_review

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 2

In [74]:
#Noktalama işaretleri ve sayıları temizledikten sonra(regex kullanarak)
sample_review= re.sub('[^a-zA-Z]', ' ', sample_review)
sample_review

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for    m

In [75]:
#Küçük harfe dönüştürme işlemi :
sample_review = sample_review.lower()
sample_review

' with all this stuff going down at the moment with mj i ve started listening to his music  watching the odd documentary here and there  watched the wiz and watched moonwalker again  maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  some of it has subtle messages about mj s feeling towards the press and also the obvious message of drugs are bad m kay visually impressive but of course this is all about michael jackson so unless you remotely like mj in anyway then you are going to hate this and find it boring  some may call mj an egotist for consenting to the making of this movie but mj and most of his fans would say that he made it for the fans which if true is really nice of him the actual feature film bit when it finally starts is only on for    m

In [76]:
#Stopwords kelimeleri çıkarma(the, is, are vb)
sample_review = sample_review.split()

In [77]:
sample_review

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again',
 'maybe',
 'i',
 'just',
 'want',
 'to',
 'get',
 'a',
 'certain',
 'insight',
 'into',
 'this',
 'guy',
 'who',
 'i',
 'thought',
 'was',
 'really',
 'cool',
 'in',
 'the',
 'eighties',
 'just',
 'to',
 'maybe',
 'make',
 'up',
 'my',
 'mind',
 'whether',
 'he',
 'is',
 'guilty',
 'or',
 'innocent',
 'moonwalker',
 'is',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'which',
 'i',
 'remember',
 'going',
 'to',
 'see',
 'at',
 'the',
 'cinema',
 'when',
 'it',
 'was',
 'originally',
 'released',
 'some',
 'of',
 'it',
 'has',
 'subtle',
 'messages',
 'about',
 'mj',
 's',
 'feeling',
 'towards',
 'the',
 'press',
 'and',
 'also',
 'the',
 'obvious',
 'message',
 'of',
 'drugs',

In [78]:
len(sample_review)

437

In [79]:
#sample review without stopwords
swords = set(stopwords.words('english'))
sample_review = [w for w in sample_review if not w in swords]
len(sample_review)

219

In [80]:
#Tüm dataframe'imiz içinde bulunan review'leri döngü içinde tamamen temizliyoruz:
def process(review):
    review =BeautifulSoup(review).get_text()
    review =re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    swords = set(stopwords.words('english'))
    review = [w for w in review if not w in swords]
    return(" ".join(review))


In [81]:
train_x_tum = []
for r in range(len(df["review"])):
    if (r+1) % 1000 == 0:
        print("Number of reviews processed =", r+1)
    train_x_tum.append(process(df["review"][r]))

Number of reviews processed = 1000
Number of reviews processed = 2000
Number of reviews processed = 3000
Number of reviews processed = 4000
Number of reviews processed = 5000
Number of reviews processed = 6000
Number of reviews processed = 7000
Number of reviews processed = 8000
Number of reviews processed = 9000
Number of reviews processed = 10000
Number of reviews processed = 11000
Number of reviews processed = 12000
Number of reviews processed = 13000
Number of reviews processed = 14000
Number of reviews processed = 15000
Number of reviews processed = 16000
Number of reviews processed = 17000
Number of reviews processed = 18000
Number of reviews processed = 19000
Number of reviews processed = 20000
Number of reviews processed = 21000
Number of reviews processed = 22000
Number of reviews processed = 23000
Number of reviews processed = 24000
Number of reviews processed = 25000


In [82]:
x = train_x_tum
y = np.array(df["sentiment"])
#train test split
train_x, test_x, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

In [84]:
vectorizer = CountVectorizer(max_features=5000)
train_x = vectorizer.fit_transform(train_x)
train_x

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1776769 stored elements and shape (22500, 5000)>

In [85]:
train_x = train_x.toarray()

In [87]:
train_y = y_train
train_x.shape, train_y.shape

((22500, 5000), (22500,))

In [88]:
train_y

array([0, 1, 1, ..., 1, 1, 0], shape=(22500,))

In [89]:
#random forest modeli oluşturma ve fit etme
model = RandomForestClassifier(n_estimators=100)
model.fit(train_x, train_y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [90]:
test_xx = vectorizer.transform(test_x)

In [91]:
test_xx

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 198356 stored elements and shape (2500, 5000)>

In [92]:
test_xx = test_xx.toarray()
test_xx.shape

(2500, 5000)

In [93]:
#prediction
test_predict = model.predict(test_xx)
dogruluk = roc_auc_score(y_test, test_predict)
print(dogruluk)

0.848795402863991
