<a href="https://colab.research.google.com/github/sprerak48/Autonomous-tagging-using-Deep-Learning/blob/Prerak/AmazonReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [0]:
# load the dataset
data = open('corpus').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

In [0]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)
print(train_y)
print(valid_y)

[1 0 0 ... 0 1 1]
[1 1 0 ... 1 1 1]


In [0]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)
print(xtrain_count, xvalid_count)

  (0, 790)	4
  (0, 1245)	1
  (0, 1249)	1
  (0, 1743)	3
  (0, 1946)	1
  (0, 2412)	1
  (0, 2566)	1
  (0, 2758)	1
  (0, 3034)	1
  (0, 3129)	1
  (0, 3771)	1
  (0, 4400)	1
  (0, 5094)	1
  (0, 6104)	1
  (0, 7228)	2
  (0, 9535)	1
  (0, 10215)	1
  (0, 10394)	1
  (0, 10881)	2
  (0, 10906)	1
  (0, 11067)	1
  (0, 11406)	1
  (0, 12908)	2
  (0, 12932)	3
  (0, 13181)	3
  :	:
  (7499, 19877)	1
  (7499, 21251)	1
  (7499, 21875)	1
  (7499, 22418)	1
  (7499, 23867)	1
  (7499, 25014)	1
  (7499, 26077)	1
  (7499, 27218)	1
  (7499, 27687)	1
  (7499, 28061)	1
  (7499, 28076)	1
  (7499, 28082)	8
  (7499, 28224)	1
  (7499, 28493)	3
  (7499, 28954)	1
  (7499, 28997)	1
  (7499, 30179)	4
  (7499, 30485)	1
  (7499, 30607)	2
  (7499, 30732)	1
  (7499, 30851)	2
  (7499, 31001)	1
  (7499, 31236)	1
  (7499, 31483)	1
  (7499, 31506)	1   (0, 82)	1
  (0, 696)	1
  (0, 710)	2
  (0, 815)	1
  (0, 1015)	1
  (0, 1265)	1
  (0, 1488)	1
  (0, 1543)	1
  (0, 1563)	1
  (0, 1743)	5
  (0, 2015)	1
  (0, 2136)	5
  (0, 3100)	1
  (0, 350

In [0]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
print(xtrain_tfidf)
print(xvalid_tfidf)

  (0, 4907)	0.03821379650691435
  (0, 4861)	0.05524906355291284
  (0, 4855)	0.10732582765182606
  (0, 4813)	0.09153062860060426
  (0, 4809)	0.03704433867033101
  (0, 4506)	0.15059794125767
  (0, 4497)	0.07932314653104082
  (0, 4493)	0.05501112234405734
  (0, 4458)	0.024955097476835744
  (0, 4442)	0.055550589068411795
  (0, 4437)	0.06815344789187752
  (0, 4428)	0.10865450523353583
  (0, 4426)	0.06765936575471296
  (0, 4338)	0.039749194653400974
  (0, 4307)	0.08407804396501607
  (0, 4218)	0.06438941276639269
  (0, 4175)	0.09727396614597951
  (0, 4090)	0.07688118016546719
  (0, 4085)	0.059286498661877074
  (0, 3977)	0.20958420356750143
  (0, 3944)	0.14307824017981496
  (0, 3810)	0.07358447864499464
  (0, 3784)	0.08541514398539488
  (0, 3450)	0.13938558437176418
  (0, 3119)	0.09334806759711325
  :	:
  (7499, 1958)	0.08205331491793076
  (7499, 1886)	0.46617437881785057
  (7499, 1867)	0.25984671413069044
  (7499, 1861)	0.129507828737475
  (7499, 1857)	0.053417992237341554
  (7499, 1811)	0.10

In [0]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)


In [0]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [13]:
from google.colab import files
uploaded = files.upload()

KeyboardInterrupt: ignored