In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
# The CSV is stored in cp1252 (Windows Western Europe), so the encoding has to
# be passed explicitly when reading it.
IMDB_CSV_PATH = '../kaggleData/imdb-review-dataset/imdb_master.csv'
df = pd.read_csv(
    IMDB_CSV_PATH,
    index_col=0,        # first CSV column is used as the row index
    encoding='cp1252',
)

In [4]:
# Preview the first rows of the loaded frame to check its columns.
df.head()

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [5]:
# Split reviews and labels by the `type` column. Each boolean mask is built
# once and reused for both the text and the label selection.
# NOTE(review): confirm that every 'train' row carries a usable sentiment label
# before fitting a supervised model on this split - TODO verify label values.
is_train = df['type'] == 'train'
is_test = df['type'] == 'test'

x_train, y_train = df.loc[is_train, 'review'], df.loc[is_train, 'label']
x_test, y_test = df.loc[is_test, 'review'], df.loc[is_test, 'label']

In [6]:
# Sanity check: features and labels of each split must have matching lengths.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(75000,) (75000,)
(25000,) (25000,)


In [7]:
# Show the first rows of each series; the printed index reveals where each
# split sits in the original file.
for series in (x_train, y_train, x_test):
    print(series.head())
# Test set  = index 0 up to 25000
# Train set = index 25000 onwards

25000    Story of a man who has unnatural feelings for ...
25001    Airport '77 starts as a brand new luxury 747 p...
25002    This film lacked something I couldn't put my f...
25003    Sorry everyone,,, I know this is supposed to b...
25004    When I was little my parents took me along to ...
Name: review, dtype: object
25000    neg
25001    neg
25002    neg
25003    neg
25004    neg
Name: label, dtype: object
0    Once again Mr. Costner has dragged out a movie...
1    This is an example of why the majority of acti...
2    First of all I hate those moronic rappers, who...
3    Not even the Beatles could write songs everyon...
4    Brass pictures (movies is not a fitting word f...
Name: review, dtype: object


## CountVectorizer

In [8]:
# Count both single tokens and adjacent token pairs (n-grams of length 1 to 2).
unigram_to_bigram = (1, 2)
vect = CountVectorizer(ngram_range=unigram_to_bigram)

In [9]:
# Learn the vocabulary from the first 10 training reviews and build their
# count matrix. `.iloc` makes the positional selection explicit, since the
# train index labels start at 25000 rather than 0.
countVects = vect.fit_transform(x_train.iloc[:10])

In [10]:
# Dimensions of the count matrix built from the 10-review sample.
countVects.shape

(10, 3150)

In [11]:
# Convert the count matrix to a dense array for inspection; only 10 reviews
# were vectorized, so the dense copy stays small.
countVects.toarray()

array([[0, 0, 0, ..., 0, 1, 1],
       [0, 0, 2, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
# First 10 entries of the learned vocabulary, in feature-column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns an ndarray, so the
# slice is converted to a plain list to keep the displayed output a list of str.
vect.get_feature_names_out()[:10].tolist()

['10',
 '10 years',
 '108',
 '108 minutes',
 '108 odd',
 '1970',
 '1970 was',
 '1972',
 '1972 submerged',
 '1977']

In [13]:
# Vectorize a document that was not part of the fitted sample, using the
# vocabulary already learned by `vect`.
unseen_docs = ['Something new and not seen before']
vect.transform(unseen_docs).toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## TfidfVectorizer

In [14]:
# Rebind `vect` to a TF-IDF vectorizer built with the library defaults; the
# CountVectorizer previously bound to this name is replaced.
vect = TfidfVectorizer()

In [15]:
# Echo the vectorizer so its repr lists the parameters in effect.
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [16]:
# Fit the TF-IDF vectorizer on the first 10 training reviews and transform
# them. `.iloc` makes the positional selection explicit, since the train index
# labels start at 25000 rather than 0.
tfVects = vect.fit_transform(x_train.iloc[:10])

In [17]:
# Dense TF-IDF weights of the first review: select row 0 first, then densify
# and flatten it to a 1-D array.
tfVects[0].toarray().ravel()

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.2215105 ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.05398864,  0.04485126,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

## 20 Newsgroups

In [18]:
from sklearn.datasets import fetch_20newsgroups

In [26]:
# Newsgroups to load; one entry per line so the list is easy to extend.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [27]:
# Load the training subset restricted to the selected categories; the fixed
# random_state keeps the shuffled order reproducible between runs.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [30]:
# Distinct class ids present in the training targets.
np.unique(twenty_train.target)

array([0, 1, 2, 3], dtype=int64)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [32]:
# Build a bag-of-words count matrix for the newsgroup training documents and
# show its dimensions.
count_vect = CountVectorizer()
newsgroup_docs = twenty_train.data
X_train_counts = count_vect.fit_transform(newsgroup_docs)
X_train_counts.shape

(2257, 35788)

In [43]:
# Number of distinct terms in the vocabulary learned by count_vect.
len(count_vect.vocabulary_)

35788

In [45]:
# Dimensions of the count matrix, for comparison with the vocabulary size.
X_train_counts.shape

(2257, 35788)

In [44]:
# Number of feature columns per document. It is read from the matrix shape
# instead of densifying the whole 2257 x 35788 matrix just to measure the
# length of one row; the value is the same.
X_train_counts.shape[1]

35788

In [46]:
# Shape of the target array, to check against the number of rows in the
# count matrix.
twenty_train.target.shape

(2257,)

In [22]:
# Logistic-regression classifier with C=100 (scikit-learn documents C as the
# inverse of regularization strength).
clf = LogisticRegression(C=100)

In [None]:
# Train the classifier on the bag-of-words counts. `fit` expects a feature
# matrix and a target vector, not the dataset container itself, so pass the
# vectorized documents together with their class ids.
clf.fit(X_train_counts, twenty_train.target)