In [52]:
import nltk
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

In [53]:
data = pd.read_csv('./twitter_training.csv') 
test = pd.read_csv('./twitter_validation.csv')

In [54]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [56]:
test.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [57]:
data.shape

(74681, 4)

In [58]:
test.shape

(999, 4)

In [59]:
data.columns=['number','borderlands','sentament','text']
test.columns=['number','borderlands','sentament','text']


In [60]:
data.tail()

Unnamed: 0,number,borderlands,sentament,text
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...
74680,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [61]:
data.isna().sum()

number           0
borderlands      0
sentament        0
text           686
dtype: int64

In [62]:
test.isna().sum()

number         0
borderlands    0
sentament      0
text           0
dtype: int64

In [63]:
data.dropna(inplace=True)

In [64]:
data.describe()

Unnamed: 0,number
count,73995.0
mean,6430.333685
std,3737.655932
min,1.0
25%,3194.0
50%,6418.0
75%,9595.0
max,13200.0


In [65]:
test.describe()

Unnamed: 0,number
count,999.0
mean,6435.159159
std,3728.912226
min,6.0
25%,3241.5
50%,6560.0
75%,9662.5
max,13197.0


In [66]:
df = data.drop(['number','borderlands'],axis=1)
ts = test.drop(['number','borderlands'],axis=1)

In [67]:
df.head()

Unnamed: 0,sentament,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [68]:
ts.head()

Unnamed: 0,sentament,text
0,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Negative,@Microsoft Why do I pay for WORD when it funct...
2,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Neutral,Now the President is slapping Americans in the...
4,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [69]:
# df['text'] = df['text'].str.lower()
# ts['text'] = ts['text'].str.lower()
# df['text'] = df['text'].apply(lambda x:x.split(" "))
# ts['text'] = ts['text'].apply(lambda x:x.split(" "))
# stop_words = set(stopwords.words('english'))
# df['text'] = df['text'].apply(lambda words: [word for word in words if word not in stop_words])
# ts['text'] = ts['text'].apply(lambda words: [word for word in words if word not in stop_words])
# stemmer = PorterStemmer()

# df['text']= df['text'].apply(lambda words: [stemmer.stem(token) for token in words])
# ts['text']= ts['text'].apply(lambda words: [stemmer.stem(token) for token in words])
# cut_words = [",","/","@","-","'","%","!","?"]
# df['text'] = df['text'].apply(lambda word: ' '.join(char for char in word if char not in cut_words))
# ts['text'] = ts['text'].apply(lambda word: ' '.join(char for char in word if char not in cut_words))

In [70]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
cut_words = [",", "/", "@", "-", "'", "%", "!", "?",","]

In [84]:
def preprocess_text(series):
    series = series.str.lower()
    # print(series[:1])
    series = series.apply(lambda x: x.split())
    # print(series[:1])
    series = series.apply(lambda words: [word for word in words if word not in stop_words])
    # print(series[:1])
    series = series.apply(lambda words: [stemmer.stem(token) for token in words])
    # print(series[:1])
    series = series.apply(lambda words: ' '.join(char for char in words if char not in cut_words))
    # print(series[:1])
    return series

In [72]:
text_prepro = Pipeline([
    ('lower_text',FunctionTransformer(preprocess_text)),
    ('vectorize', CountVectorizer(ngram_range=(2,3))),
])

In [73]:
type(df['text'])

pandas.core.series.Series

In [74]:
x_train = text_prepro.fit_transform(df['text'])
x_test = text_prepro.transform(ts['text'])


0    i am coming to the borders and i will kill you...
Name: text, dtype: object
0    [i, am, coming, to, the, borders, and, i, will...
Name: text, dtype: object
0    [coming, borders, kill, all,]
Name: text, dtype: object
0    [come, border, kill, all,]
Name: text, dtype: object
0    come border kill all,
Name: text, dtype: object
0    bbc news - amazon boss jeff bezos rejects clai...
Name: text, dtype: object
0    [bbc, news, -, amazon, boss, jeff, bezos, reje...
Name: text, dtype: object
0    [bbc, news, -, amazon, boss, jeff, bezos, reje...
Name: text, dtype: object
0    [bbc, news, -, amazon, boss, jeff, bezo, rejec...
Name: text, dtype: object
0    bbc news amazon boss jeff bezo reject claim co...
Name: text, dtype: object


In [75]:
model = LogisticRegression()
encoder = LabelEncoder()

In [107]:
x_train[0].shape

(1, 664198)

In [76]:
y_train = encoder.fit_transform(df['sentament'])
y_test = encoder.fit_transform(ts['sentament'])

In [77]:
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
y_pred = model.predict(x_test)

In [79]:
accuracy = accuracy_score(y_test,y_pred)

In [80]:
accuracy

0.963963963963964

In [132]:
# Example text
text = "i really hated"
text = pd.Series([text])
# Apply the preprocessing pipeline to the text
preprocessed_text = text_prepro.transform(text)

# Convert the sparse matrix to a dense array for display
preprocessed_text_array = preprocessed_text.toarray().reshape(1,-1)

print(preprocessed_text_array.shape)

0    i really hated
dtype: object
0    [i, really, hated]
dtype: object
0    [really, hated]
dtype: object
0    [realli, hate]
dtype: object
0    realli hate
dtype: object
(1, 664198)


In [133]:
encoder.inverse_transform(model.predict(preprocessed_text_array))

array(['Negative'], dtype=object)