In [1]:
import pandas as pd

# The CSV has no header row; keep only columns 1 and 6 and name them below.
data = pd.read_csv("data/tweets.csv", header=None, usecols=[1, 6])
data.columns = ["sentiment", "text"]

# Remap label 4 to 1; every other label value is left as it is.
data["sentiment"] = data["sentiment"].replace({4: 1})
data.head()

Unnamed: 0,sentiment,text
0,0,my arm still hurts from when i pulled it yeste...
1,1,I have so much to do outside! Been looking at ...
2,0,"@AbsolutSara Yes, I knew about the clusterfark..."
3,0,Just woke up and i feel relieved Haha now i ha...
4,1,LOVING the hot weather forecast for the rest o...


In [2]:
# Inputs are the tweet texts; targets are the sentiment labels.
X, y = data["text"], data["sentiment"]

X.shape

(30000,)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [4]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Build the vocabulary from the training split only, using default settings.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

CountVectorizer()

In [7]:
vectorizer.get_feature_names_out()

array(['00', '000', '000thb', ..., 'ø¹ø', 'ø¹øª', 'ø¹ù'], dtype=object)

In [8]:
len(vectorizer.get_feature_names_out())

37682

In [9]:
X_train_vec = vectorizer.transform(X_train)

In [10]:
X_train_vec

<27000x37682 sparse matrix of type '<class 'numpy.int64'>'
	with 320153 stored elements in Compressed Sparse Row format>

In [11]:
# Density of the document-term matrix: stored entries divided by the total
# number of cells. Read from the matrix itself (instead of numbers copied by
# hand from earlier outputs) so it stays correct when the data changes.
X_train_vec.nnz / (X_train_vec.shape[0] * X_train_vec.shape[1])

0.0003146732795106024

In [12]:
X_train_vec[:10,:10].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [13]:
?CountVectorizer

In [14]:
# Vocabulary size when terms must appear in at least 3 training documents.
vectorizer = CountVectorizer(min_df=3).fit(X_train)
len(vectorizer.get_feature_names_out())

7011

In [15]:
# Additionally drop terms that occur in more than 10% of the training documents.
vectorizer = CountVectorizer(min_df=3, max_df=0.1).fit(X_train)
len(vectorizer.get_feature_names_out())

7000

In [16]:
?CountVectorizer

In [17]:
# Cap the vocabulary at 5000 terms.
vectorizer = CountVectorizer(max_features=5000).fit(X_train)
len(vectorizer.get_feature_names_out())

5000

In [18]:
from sklearn.pipeline import make_pipeline

In [19]:
# Bag-of-words counts (min_df=3) feeding a linear SVM, scored on the held-out split.
# LinearSVC's solver uses a random number generator, so it is seeded with the same
# value as the train/test split to make the reported accuracy reproducible.
pipeline = make_pipeline(CountVectorizer(min_df=3), LinearSVC(random_state=42))
pipeline.fit(X_train, y_train)
accuracy_score(y_test, pipeline.predict(X_test))



0.735

Jeśli podczas wywoływania `transform` pojawią się słowa, których nie było podczas `fit`, zostaną one pominięte.

In [20]:
# Two toy documents sharing the words "uczenie" and "maszynowe".
texts = [
    "Lubię uczenie maszynowe",
    "Uczenie maszynowe jest najlepsze",
]

# Unigrams and bigrams, restricted to n-grams present in at least 2 documents.
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2).fit(texts)
vectorizer.get_feature_names_out()

array(['maszynowe', 'uczenie', 'uczenie maszynowe'], dtype=object)

In [21]:
# Toy documents containing one-character tokens ("3" and "I").
texts = [
    "Lubię uczenie maszynowe <3",
    "I uczenie maszynowe jest super!!",
]

# Default settings, to see which tokens end up in the vocabulary.
vectorizer = CountVectorizer().fit(texts)
vectorizer.get_feature_names_out()

array(['jest', 'lubię', 'maszynowe', 'super', 'uczenie'], dtype=object)

Nie ma słowa "I" ani liczby "3", bo są to "słowa" jednoznakowe, a domyślny `token_pattern` dopuszcza tylko tokeny złożone z co najmniej dwóch znaków.

In [22]:
?CountVectorizer

In [23]:
# Custom token pattern: one or more word characters between word boundaries,
# so one-character tokens are kept as well.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b").fit(texts)
vectorizer.get_feature_names_out()

array(['3', 'i', 'jest', 'lubię', 'maszynowe', 'super', 'uczenie'],
      dtype=object)

## Przygotowanie danych tekstowych do modelowania

In [24]:
X_train

346                                 is home for the night 
13028                                   @ThisismyiQ Im up 
8821     @vickycornell and by the way, your damn right....
25676    3 boys asleep and oldest not home from school ...
7534     @MelFresh27 I'm going to try the servo across ...
                               ...                        
29802             portobello road and wimbledon for a bbq 
5390     @Shannenp it is absolutely genius!  i should s...
860      @piginthepoke later gator, will keep you updat...
15795    @aaronh Interesting! No, I didn't know. It's a...
23654    @Wossy I'm listening too, it's not the same pr...
Name: text, Length: 27000, dtype: object

In [25]:
import re

In [26]:
x = "@JessMcFlyxxx haha you ok?"

In [27]:
# Replace an @mention with a space. The pattern is a raw string so that `\w`
# is passed to the regex engine instead of being an invalid string escape.
re.sub(r"@\w+", " ", x)

'  haha you ok?'

In [28]:
# Replace every @mention with a space in both splits.
# Raw-string pattern avoids the invalid `\w` string escape; the vectorised
# `.str.replace` does the same substitution without a per-row Python lambda.
X_train = X_train.str.replace(r"@\w+", " ", regex=True)
X_test = X_test.str.replace(r"@\w+", " ", regex=True)

In [29]:
# Peek at ten cleaned tweets (rows 20-29 by position).
X_train.iloc[20:30].tolist()

["Sleeping alone tonight cause Crystal went to Josie's house. ",
 '  There probably are a few down in the depths, I sometimes wonder what I would do if I hauled one up ',
 "  she woke up this morning when I was moving the wheel down and she was making these squeeky noises  she's mad at me still",
 "Going to bedd soon. I am sooooooo tired  gonna fall asleep to the sound of &quot;You're the one that I want the one that I want ooh ooh ooh ya&quot;",
 "migraines suck   I hope truffles don't make them worse ;)",
 "Today wasn't so bad  I started reading just listen and today I decided that I'm going to iron my bangs since I havent done so in like days",
 'Today was just one long hang over ',
 '  receptionist! hehe ',
 "  kriiiiista, please say ur alive, not seen u in almost a week! I'm going to be shit at coping when u go to Uni  xxx",
 'going to bed tomorrow long day first school then working... ']

In [30]:
# Try the replacement on one sample: the leftover "&quot;" HTML entity becomes a space.
x = "ooh ya&quot;"
x.replace("&quot;", " ")

'ooh ya '

In [31]:
# Replace the literal "&quot;" entity with a space in both splits.
# regex=False makes this a plain substring replacement, the same as str.replace;
# unlike the per-row lambda, missing values pass through instead of raising.
X_train = X_train.str.replace("&quot;", " ", regex=False)
X_test = X_test.str.replace("&quot;", " ", regex=False)

Czyszczenie tekstów polega na usunięciu zawartości, której nie chcemy uwzględniać w modelowaniu.

Konkrety zależą od celu działania algorytmu.

W praktyce dane zazwyczaj trzeba czyścić i często stanowi to dużą część pracy.