In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, WordNetLemmatizer
import nltk
import re 
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("stock_data.csv", encoding = "ISO-8859-1", engine='python')

In [3]:
data.describe()

Unnamed: 0,Sentiment
count,5791.0
mean,0.272664
std,0.962192
min,-1.0
25%,-1.0
50%,1.0
75%,1.0
max,1.0


In [4]:
data["Sentiment"] = data["Sentiment"].replace(-1,0)

In [5]:
data["Sentiment"].value_counts()

1    3685
0    2106
Name: Sentiment, dtype: int64

### Tr - Veri ön işleme için gerekli aletlerimizi tanımlıyoruz ve tüm veri ön işlemeyi bir for döngüsü ile aradan çıkarıyoruz.

### Eng - We define our tools for data preprocessing and do all data preprocessing in a loop.

In [6]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

In [7]:
ps = PorterStemmer()
lemma = WordNetLemmatizer()
stopwordSet = set(stopwords.words("english"))

In [8]:
text_reviews = list()
for i in range(len(data)):
    text = re.sub('[^a-zA-Z]'," ",data['Text'][i])
    text = text.lower()
    text = word_tokenize(text,language="english")
    text = [lemma.lemmatize(word) for word in text if(word) not in stopwordSet]
    text = " ".join(text)
    text_reviews.append(text)

#### Tr - Özellik çıkarımı kısmına geldik. Bu kısımda metinlerimizi temsil eden vektörler oluşturacağız ve modellerimizi bu vektörlerimize göre eğiteceğiz. Modelin başarısını etkileyen önemli faktörlerdendir. Ben 2 farklı yöntemin 2'sinide değerlendirdim ve başarılı olmayanı yorum içine aldım.

#### Eng - We're in the feature extraction part. In this section, we will create vectors that represent our texts and train our models according to these vectors. It is one of the important factors affecting the success of the model. I evaluated 2 of the 2 different methods and commented on the unsuccessful one.

Tr - Aslın başarılı sayabileceğimiz 2 modelimiz var. Count vectorizer sadece lojistic regresyon için başarısızdır.

Eng - In fact, we have 2 models that we can consider successful. Count vectorizer fails only for logistic regression.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8',
 decode_error='ignore')

In [10]:
'''from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(binary= False, ngram_range = (1,2))
#accurucy maalesef %78'''

'from sklearn.feature_extraction.text import CountVectorizer\nvectorizer = CountVectorizer(binary= False, ngram_range = (1,2))\n#accurucy maalesef %78'

In [11]:
vectorizer.fit(text_reviews)
X = vectorizer.transform(text_reviews)
y= data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.20, random_state = 42)

In [12]:
logreg = LogisticRegression(random_state = 21, solver = "saga", C = 8, penalty = 'l2')
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [13]:
print("\n\naccuracy: {}".format(accuracy_score(y_test, y_pred)))



accuracy: 0.7946505608283002


#### Tr - GBM Kullanıyoruz.
#### Eng - We are using GBM.

In [14]:
#from sklearn.ensemble import GradientBoostingClassifier

In [15]:
#clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.50, max_depth = 20)

In [16]:
#clf.fit(X_train, y_train)

In [17]:
#y_pred2 = clf.predict(X_test)

In [18]:
#print("\n\naccuracy: {}".format(accuracy_score(y_test, y_pred2)))

#### Tr - Support Vector Machines Kullanıyoruz.

#### Eng - We are using Support Vector Machines.

In [19]:
from sklearn import svm

In [20]:
svm = svm.SVC(C = 5 , kernel='rbf')

In [21]:
svm.fit(X_train, y_train)

SVC(C=5)

In [22]:
y_pred3 = svm.predict(X_test)

In [23]:
print("\n\naccuracy: {}".format(accuracy_score(y_test, y_pred3)))



accuracy: 0.7963761863675582


#### Tr -  XGBClassifier Kullanıyoruz
#### Eng - We are using XGBClassifier.

In [24]:
from xgboost import XGBClassifier

In [25]:
xgb = XGBClassifier(n_estimator = 500, max_depth = 5, learning_rate = 0.5)

In [26]:
xgb.fit(X_train, y_train)

Parameters: { n_estimator } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimator=500, n_estimators=100, n_jobs=0, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [27]:
y_pred4 = xgb.predict(X_test)

In [28]:
print("\n\naccuracy: {}".format(accuracy_score(y_test, y_pred4)))



accuracy: 0.7679033649698016


#### Tr - LighGBM Classifier Kullanıyoruz.
#### Eng - L

In [29]:
from lightgbm import LGBMClassifier

In [30]:
Y_train = y_train.astype('float32')
Y_test = y_test.astype('float32')
x_train = X_train.astype('float32')
x_test = X_test.astype('float32')

In [31]:
lgb = LGBMClassifier(learning_rate = 0.10, n_estimator = 500, max_depth = 8)

In [32]:
lgb.fit(x_train, Y_train)



LGBMClassifier(max_depth=8, n_estimator=500)

In [33]:
y_pred5 = lgb.predict(x_test)

In [34]:
print("\n\naccuracy: {}".format(accuracy_score(y_test, y_pred5)))



accuracy: 0.7515099223468508


Sonuç: En hızlı ve en yüksek accurcy değerini veren SVM ardından ise lojistik regresyondur. En kötüsü ise yavaşlığından doalyı GBM algoritmasıdır. Ağlattı beni ya.