In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_csv('./twitter_training.csv') 
test = pd.read_csv('./twitter_validation.csv')

In [3]:
data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [4]:
test.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [5]:
data.shape

(74681, 4)

In [6]:
test.shape

(999, 4)

In [7]:
data.columns=['number','borderlands','sentament','text']
test.columns=['number','borderlands','sentament','text']


In [8]:
data.tail()

Unnamed: 0,number,borderlands,sentament,text
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...
74680,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [9]:
data.isna().sum()

number           0
borderlands      0
sentament        0
text           686
dtype: int64

In [10]:
test.isna().sum()

number         0
borderlands    0
sentament      0
text           0
dtype: int64

In [11]:
data.dropna(inplace=True)

In [12]:
data.describe()

Unnamed: 0,number
count,73995.0
mean,6430.333685
std,3737.655932
min,1.0
25%,3194.0
50%,6418.0
75%,9595.0
max,13200.0


In [13]:
test.describe()

Unnamed: 0,number
count,999.0
mean,6435.159159
std,3728.912226
min,6.0
25%,3241.5
50%,6560.0
75%,9662.5
max,13197.0


In [14]:
df = data.drop(['number','borderlands'],axis=1)
ts = test.drop(['number','borderlands'],axis=1)

In [15]:
df.head()

Unnamed: 0,sentament,text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [16]:
ts.head()

Unnamed: 0,sentament,text
0,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,Negative,@Microsoft Why do I pay for WORD when it funct...
2,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,Neutral,Now the President is slapping Americans in the...
4,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [17]:
df['text'] = df['text'].str.lower()
ts['text'] = ts['text'].str.lower()

In [18]:
df.head()

Unnamed: 0,sentament,text
0,Positive,i am coming to the borders and i will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [19]:
ts.head()

Unnamed: 0,sentament,text
0,Neutral,bbc news - amazon boss jeff bezos rejects clai...
1,Negative,@microsoft why do i pay for word when it funct...
2,Negative,"csgo matchmaking is so full of closet hacking,..."
3,Neutral,now the president is slapping americans in the...
4,Negative,hi @eahelp i’ve had madeleine mccann in my cel...


In [20]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
# df['text'] = df['text'].apply(lambda x:x.replace("  "," "))

In [22]:
df['text'] = df['text'].apply(lambda x:x.split(" "))
ts['text'] = ts['text'].apply(lambda x:x.split(" "))

In [23]:
df['text'].head(100)

0      [i, am, coming, to, the, borders, and, i, will...
1      [im, getting, on, borderlands, and, i, will, k...
2      [im, coming, on, borderlands, and, i, will, mu...
3      [im, getting, on, borderlands, 2, and, i, will...
4      [im, getting, into, borderlands, and, i, can, ...
                             ...                        
96     [grounded, looked, cool, despite, the, borderl...
97     [grosskreutz, looked, pretty, cool,, even, des...
98     [grounded, almost, looked, pretty, cool, here,...
99     [deep, grounded, almost, looked, pretty, cool,...
100    [grounded, almost, was, pretty, cool, even, de...
Name: text, Length: 100, dtype: object

In [24]:
ts['text'].head(100)

0     [bbc, news, -, amazon, boss, jeff, bezos, reje...
1     [@microsoft, why, do, i, pay, for, word, when,...
2     [csgo, matchmaking, is, so, full, of, closet, ...
3     [now, the, president, is, slapping, americans,...
4     [hi, @eahelp, i’ve, had, madeleine, mccann, in...
                            ...                        
95    [@verizon, can, you, waive, some, data, overag...
96    [no, one, buy, battlefield, 3, on, steam!, it,...
97    [our, #hisaperth, #obiawards, ceremony, is, ta...
98    [#indigo, urgent, care, looks, to, microsoft, ...
99    [🤔, not, sure, where, all, your, data, is, goi...
Name: text, Length: 100, dtype: object

In [25]:
stop_words = set(stopwords.words('english'))

In [26]:
df['text'] = df['text'].apply(lambda words: [word for word in words if word not in stop_words])
ts['text'] = ts['text'].apply(lambda words: [word for word in words if word not in stop_words])

In [27]:
stemmer = PorterStemmer()


In [28]:
df['text']

0                            [coming, borders, kill, all,]
1                   [im, getting, borderlands, kill, all,]
2                  [im, coming, borderlands, murder, all,]
3              [im, getting, borderlands, 2, murder, all,]
4                 [im, getting, borderlands, murder, all,]
                               ...                        
74676    [realized, windows, partition, mac, like, 6, y...
74677    [realized, mac, window, partition, 6, years, b...
74678    [realized, windows, partition, mac, 6, years, ...
74679    [realized, windows, partition, mac, like, 6, y...
74680    [like, windows, partition, mac, like, 6, years...
Name: text, Length: 73995, dtype: object

In [29]:
df['text']= df['text'].apply(lambda words: [stemmer.stem(token) for token in words])
ts['text']= ts['text'].apply(lambda words: [stemmer.stem(token) for token in words])

In [30]:
df['text']

0                               [come, border, kill, all,]
1                        [im, get, borderland, kill, all,]
2                     [im, come, borderland, murder, all,]
3                   [im, get, borderland, 2, murder, all,]
4                      [im, get, borderland, murder, all,]
                               ...                        
74676    [realiz, window, partit, mac, like, 6, year, b...
74677    [realiz, mac, window, partit, 6, year, behind,...
74678    [realiz, window, partit, mac, 6, year, behind,...
74679    [realiz, window, partit, mac, like, 6, year, b...
74680    [like, window, partit, mac, like, 6, year, beh...
Name: text, Length: 73995, dtype: object

In [31]:
ts['text']

0      [bbc, news, -, amazon, boss, jeff, bezo, rejec...
1      [@microsoft, pay, word, function, poorli, @sam...
2      [csgo, matchmak, full, closet, hacking,, truli...
3      [presid, slap, american, face, realli, commit,...
4      [hi, @eahelp, i’v, madelein, mccann, cellar, p...
                             ...                        
994    [⭐️, toronto, art, cultur, capit, canada,, it’...
995    [actual, good, move, tot, bring, viewers.\n\ni...
996    [today, suck, it’, time, drink, wine, n, play,...
997    [bought, fraction, microsoft, today., small, w...
998    [johnson, &, johnson, stop, sell, talc, babi, ...
Name: text, Length: 999, dtype: object

In [32]:
cut_words = [",","/","@","-","'","%","!","?"]
df['text'] = df['text'].apply(lambda word: ' '.join(char for char in word if char not in cut_words))
ts['text'] = ts['text'].apply(lambda word: ' '.join(char for char in word if char not in cut_words))

In [33]:
print(df['text'][:5])
print(ts['text'][:5])

0              come border kill all,
1        im get borderland kill all,
2     im come borderland murder all,
3    im get borderland 2 murder all,
4      im get borderland murder all,
Name: text, dtype: object
0    bbc news amazon boss jeff bezo reject claim co...
1    @microsoft pay word function poorli @samsungu ...
2    csgo matchmak full closet hacking, truli aw game.
3    presid slap american face realli commit unlaw ...
4    hi @eahelp i’v madelein mccann cellar past 13 ...
Name: text, dtype: object


In [34]:
ts.head()

Unnamed: 0,sentament,text
0,Neutral,bbc news amazon boss jeff bezo reject claim co...
1,Negative,@microsoft pay word function poorli @samsungu ...
2,Negative,"csgo matchmak full closet hacking, truli aw game."
3,Neutral,presid slap american face realli commit unlaw ...
4,Negative,hi @eahelp i’v madelein mccann cellar past 13 ...


In [35]:
df.head()

Unnamed: 0,sentament,text
0,Positive,"come border kill all,"
1,Positive,"im get borderland kill all,"
2,Positive,"im come borderland murder all,"
3,Positive,"im get borderland 2 murder all,"
4,Positive,"im get borderland murder all,"


In [36]:
vectorizer = CountVectorizer(ngram_range=(1,2))


In [37]:
x_train = vectorizer.fit_transform(df['text'])

# Transform the test data using the same vectorizer
x_test = vectorizer.transform(ts['text'])

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [39]:
model = LogisticRegression()
encoder = LabelEncoder()

In [40]:
y_train = encoder.fit_transform(df['sentament'])
y_test = encoder.fit_transform(ts['sentament'])

In [41]:
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
y_pred = model.predict(x_test)

In [43]:
accuracy = accuracy_score(y_test,y_pred)

In [44]:
accuracy

0.9719719719719719