In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np

from pyisax import PyiSA

In [37]:
def make_dtm_from_texts(texts, min_df=0.005, stop_words='english'):
    """Build a stemmed document-term count matrix from raw texts.

    Each document is run through scikit-learn's default ``CountVectorizer``
    analyzer (configured with ``stop_words``), every resulting token is
    Porter-stemmed, and the stems are counted.

    Parameters
    ----------
    texts : iterable of str
        The raw documents, one string per document.
    min_df : float or int, default 0.005
        Passed straight to ``CountVectorizer(min_df=...)``; stems whose
        document frequency falls below this cutoff are left out of the
        vocabulary.
    stop_words : default 'english'
        Passed straight to ``CountVectorizer(stop_words=...)`` when building
        the base analyzer, i.e. stop words are removed *before* stemming.

    Returns
    -------
    The matrix returned by ``CountVectorizer.fit_transform(texts)``: one row
    per document, one column per retained stem.

    Notes
    -----
    The fitted vectorizer is local to this function, so the column-to-stem
    mapping is not available to the caller.
    """
    stemmer = PorterStemmer()
    # Borrow the stock analyzer only for its tokenisation / stop-word
    # filtering; the vectorizer that actually counts is built below.
    base_analyzer = CountVectorizer(stop_words=stop_words).build_analyzer()

    def stemmed(doc):
        # Lazily yield the stem of every token the base analyzer produces.
        return (stemmer.stem(token) for token in base_analyzer(doc))

    # A callable analyzer replaces the vectorizer's own tokenisation, so the
    # vocabulary is made of stems rather than surface forms.
    stem_vectorizer = CountVectorizer(analyzer=stemmed, min_df=min_df)

    return stem_vectorizer.fit_transform(texts)

In [38]:
# Load the tweet table from disk; pandas parses the already-open file handle.
with open('Trump.csv', newline='') as trump_file:
    trump_data = pd.read_csv(trump_file)

# The 'text' column holds the raw documents to vectorise.
tweet_texts = trump_data['text']
print('Tweet data loaded. Creating term-document matrix ({} tweets).'.format(len(tweet_texts)))
X = make_dtm_from_texts(tweet_texts)

# Bare expression so the notebook displays the matrix summary.
X

Tweet data loaded. Creating term-document matrix (28741 tweets).


<28741x368 sparse matrix of type '<class 'numpy.int64'>'
	with 229735 stored elements in Compressed Sparse Row format>

In [39]:
# Labels: the 'Sentiment' column of the tweet table (missing for unlabeled tweets).
y = trump_data['Sentiment']

# Features: hand the term-document matrix to PyiSA's own preparation step.
# NOTE(review): X is overwritten with its prepared form, so re-running this
# cell feeds the already-prepared X through prep_data a second time.
X = PyiSA.prep_data(X)

In [40]:
# Split the rows by whether a sentiment label is present: labeled rows become
# the training set, rows with a missing label become the test set.
#
# The null mask is computed once for the whole label column and applied by
# position, so it stays aligned with the rows of X even if y's index is not
# the default 0..n-1.
is_unlabeled = np.asarray(pd.isnull(y))
train_rows = [i for i, missing in enumerate(is_unlabeled) if not missing]
test_rows = [i for i, missing in enumerate(is_unlabeled) if missing]

X_train = [X[i] for i in train_rows]
X_test = [X[i] for i in test_rows]
y_train = [label for label, missing in zip(y, is_unlabeled) if not missing]

In [41]:
# Create the iSA estimator with verbose logging and fit it, passing the
# labeled rows, the unlabeled rows and the training labels, in that order.
# NOTE(review): the recorded log mentions a bootstrapping stage; no random
# seed is set in this notebook, so results may differ between runs — confirm
# how PyiSA seeds its resampling.
my_isa = PyiSA(verbose=True)
my_isa.fit(X_train, X_test, y_train)

Commencing iSA run (verbose mode enabled)...
Feature Space: 92 features X 28741 documents (482 training, 28259 test)
Augmenting dataset using 19 splits...
Training iSA for 4 categories.
Absolute determinant of (P'*P): 2.6967745280567908e-11
Bootstrapping... (1000 passes)
[DONE]   Execution time: 3.1970088481903076


In [42]:
# Display the fitted model's `best_table` attribute (per-category estimates in
# the recorded output below).
my_isa.best_table

Unnamed: 0_level_0,Estimate,Std. Error,z value,Pr(>|z|)
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,0.455594,0.027458,16.592507,7.895406e-62
neutral,0.329418,0.031886,10.331146,5.0948080000000005e-25
offTopic,0.029749,0.01359,2.18903,0.02859467
positive,0.185239,0.033039,5.606743,2.061695e-08


In [30]:
# Display the label column; pandas truncates the middle of a long Series.
# NOTE(review): this cell's execution count (30) is out of sequence with the
# cells around it (42, 43), so the recorded output may predate the cells above.
y

0         neutral
1        positive
2        positive
3         neutral
4        negative
5        negative
6        negative
7        positive
8        negative
9         neutral
10       negative
11       negative
12       negative
13       negative
14       positive
15       positive
16       positive
17        neutral
18       negative
19        neutral
20       negative
21       negative
22       negative
23       positive
24       negative
25        neutral
26       positive
27        neutral
28       positive
29        neutral
           ...   
28711         NaN
28712         NaN
28713         NaN
28714         NaN
28715         NaN
28716         NaN
28717         NaN
28718         NaN
28719         NaN
28720         NaN
28721         NaN
28722         NaN
28723         NaN
28724         NaN
28725         NaN
28726         NaN
28727         NaN
28728         NaN
28729         NaN
28730         NaN
28731         NaN
28732         NaN
28733         NaN
28734         NaN
28735     

In [43]:
# Rebuild the term-document matrix from the raw tweet text under a separate
# name. This repeats the earlier make_dtm_from_texts call on the same column;
# by this point X has been reassigned to the output of PyiSA.prep_data.
the_X = make_dtm_from_texts(trump_data['text'])

In [44]:
# Display the dimensions of the rebuilt matrix (rows are documents; columns
# are the stems kept by the vectorizer).
the_X.shape

(28741, 368)