In [1]:
import os 
import re  
import pandas as pd
from sklearn.metrics import confusion_matrix 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn import metrics 
import nltk



In [2]:
short_pos=pd.read_table("short_reviews/positive.txt",sep="\n", header=None, encoding='latin-1')
short_neg=pd.read_table("short_reviews/negative.txt",sep="\n", header=None,encoding='latin-1')

In [3]:
short_pos.head()

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."


In [4]:
short_pos.shape

(5331, 1)

In [5]:
short_pos.columns =["Reviews"]
short_neg.columns =["Reviews"]

short_pos["Sentiment"]="1"
short_neg["Sentiment"]="0"

In [6]:
x_short_pos=short_pos[:1000]
x_short_neg=short_neg[:1000]

In [7]:
data=pd.concat([x_short_pos, x_short_neg])
data.index=range(len(data.Sentiment))

In [8]:
data.shape

(2000, 2)

In [9]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
    #
    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 3. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))  

In [10]:
#for example
re.sub("[^a-zA-Z]", " ", "23anb") 

'  anb'

In [11]:
data["Reviews"][1]

'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . '

In [12]:
num_reviews = data["Reviews"].size

# Initialize an empty list to hold the clean reviews
clean_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range( 0, num_reviews ):
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_reviews.append( review_to_words( data["Reviews"][i] ) )

In [13]:
clean_reviews

['rock destined st century new conan going make splash even greater arnold schwarzenegger jean claud van damme steven segal',
 'gorgeously elaborate continuation lord rings trilogy huge column words cannot adequately describe co writer director peter jackson expanded vision j r r tolkien middle earth',
 'effective tepid biopic',
 'sometimes like go movies fun wasabi good place start',
 'emerges something rare issue movie honest keenly observed feel like one',
 'film provides great insight neurotic mindset comics even reached absolute top game',
 'offers rare combination entertainment education',
 'perhaps picture ever made literally showed road hell paved good intentions',
 'steers turns snappy screenplay curls edges clever want hate somehow pulls',
 'take care cat offers refreshingly different slice asian cinema',
 'film well worth seeing talking singing heads',
 'really surprises wisegirls low key quality genuine tenderness',
 'wendigo go cinema fed eye heart mind',
 'one greatest fa

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3),max_features=800)
data_features = tfidf_vectorizer.fit_transform(clean_reviews)

print(data_features)

  (0, 89)	0.4366057642752368
  (0, 462)	0.3206456878418972
  (0, 293)	0.3911347388292693
  (0, 412)	0.3063280743068435
  (0, 204)	0.2801594004236266
  (0, 740)	0.4366057642752368
  (0, 654)	0.4366057642752368
  (1, 333)	0.3767465853384458
  (1, 107)	0.3854725729338934
  (1, 789)	0.3265191281410121
  (1, 164)	0.25327430712685867
  (1, 502)	0.3767465853384458
  (1, 746)	0.3854725729338934
  (1, 435)	0.36252046200124327
  (1, 790)	0.34170471293346116
  (2, 178)	1.0
  (3, 634)	0.40388880616381234
  (3, 389)	0.24583103973630513
  (3, 291)	0.3657146196634408
  (3, 448)	0.3357380471631521
  (3, 270)	0.3479368725645129
  (3, 295)	0.28486857111725045
  (3, 505)	0.3943141720006519
  (3, 650)	0.4150661109610346
  (4, 389)	0.28844081506118296
  :	:
  (1993, 421)	0.33867271105662883
  (1994, 728)	0.45419730168625533
  (1994, 352)	0.5024353806750297
  (1994, 701)	0.4605853026889391
  (1994, 759)	0.5736938890504248
  (1995, 204)	0.5306611160080862
  (1995, 387)	0.5778154111399058
  (1995, 513)	0.6201

In [16]:
data_features = data_features.todense()

In [17]:
data_features=pd.DataFrame(data_features,columns=tfidf_vectorizer.get_feature_names())

data_features["Sentiment"]=data["Sentiment"]

data_features = data_features.sample(frac =1)

In [18]:
data_features.head(5)

Unnamed: 0,able,absorbing,act,acted,acting,action,actor,actors,actress,actually,...,writing,written,ya,year,year old,years,yes,yet,young,Sentiment
1515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [19]:
train,test = train_test_split(data_features, test_size = 0.2) 

cols = [col for col in data_features.columns if col not in ["Sentiment"]]

train.x= train[cols]
train.y=train["Sentiment"]

test.x=test[cols]
test.y=test["Sentiment"]

  """
  
  
  if __name__ == '__main__':


In [20]:
model_linear = svm.SVC(kernel='linear') 
model_linear.fit(train.x,train.y.astype(int)) 
preds = model_linear.predict(test.x)
confusion_matrix(test.y.astype(int),preds)

accuracy = metrics.accuracy_score(test.y.astype(int), preds)
print(accuracy)

0.6325


In [44]:
allowed_word_types = ["J"]            

def review_to_Pos( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    #
    all_words = []
    
    # 1. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
    #
    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 3. tag the reveiws
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            # extracting the adjectives
            all_words.append(w[0])
     
    #4. remove stopwords
    stops = set(stopwords.words("english"))     
    
    all_words = [w for w in all_words if not w in stops] 
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join(all_words))

In [27]:
num_reviews = data["Reviews"].size

# Initialize an empty list to hold the clean reviews
clean_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range( 0, num_reviews ):
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_reviews.append( review_to_Pos( data["Reviews"][i] ) )

In [45]:
tmp = re.sub("[^a-zA-Z]", " ", data["Reviews"][0]) 
tmp
nltk.pos_tag(tmp.lower().split())

[('the', 'DT'),
 ('rock', 'NN'),
 ('is', 'VBZ'),
 ('destined', 'VBN'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('the', 'DT'),
 ('st', 'JJ'),
 ('century', 'NN'),
 ('s', 'VBD'),
 ('new', 'JJ'),
 ('conan', 'NN'),
 ('and', 'CC'),
 ('that', 'IN'),
 ('he', 'PRP'),
 ('s', 'VBZ'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('a', 'DT'),
 ('splash', 'NN'),
 ('even', 'RB'),
 ('greater', 'JJR'),
 ('than', 'IN'),
 ('arnold', 'RB'),
 ('schwarzenegger', 'JJ'),
 ('jean', 'JJ'),
 ('claud', 'NN'),
 ('van', 'NN'),
 ('damme', 'NN'),
 ('or', 'CC'),
 ('steven', 'JJ'),
 ('segal', 'NN')]

In [None]:
Reference:
Tweets Sentiment Use Case:
https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/