In [None]:
import nltk

# Tokenization in NLP

In [None]:
nltk.download('stopwords')
nltk.download("punkt")
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
paragraph = """India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[26] is a country in South Asia. It is the seventh-largest country by area, 
            the second-most populous country, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, 
            and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the north; and Bangladesh and,
            Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; 
            its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar and Indonesia,
            Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.[27][28][29] Their long occupation, 
            initially in varying forms of isolation as hunter-gatherers, has made the region highly diverse, second only to Africa in human 
            genetic diversity.[30] Settled life emerged on the subcontinent in the western margins of the Indus river basin 9,000 years ago, 
            evolving gradually into the Indus Valley Civilisation of the third millennium BCE.[31] By 1200 BCE, an archaic form of Sanskrit, 
            an Indo-European language, had diffused into India from the northwest,[32][33] unfolding as the language of the Rigveda, 
            and recording the dawning of Hinduism in India.[34] The Dravidian languages of India were supplanted in the northern and 
            western regions.[35] By 400 BCE, stratification and exclusion by caste had emerged within Hinduism,[36] and Buddhism and 
            Jainism had arisen, proclaiming social orders unlinked to heredity.[37] Early political consolidations gave rise to the 
            loose-knit Maurya and Gupta Empires based in the Ganges Basin.[38] Their collective era was suffused with wide-ranging 
            creativity,[39] but also marked by the declining status of women,[40] and the incorporation of untouchability into an organised system of belief.
            [g][41] In South India, the Middle kingdoms exported Dravidian-languages scripts and religious cultures to the kingdoms of Southeast Asia.[42]"""

In [None]:
# converting paragraph into sentences

sentences = nltk.sent_tokenize(paragraph)
print(len(sentences))
sentences

13


['India, officially the Republic of India (Hindi: Bhārat Gaṇarājya),[26] is a country in South Asia.',
 'It is the seventh-largest country by area, \n            the second-most populous country, and the most populous democracy in the world.',
 'Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, \n            and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the north; and Bangladesh and,\n            Myanmar to the east.',
 'In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; \n            its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar and Indonesia,\n            Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.',
 '[27][28][29] Their long occupation, \n            initially in varying forms of isolation as hunter-gatherers, has made the region highly diverse, second only to Africa in human \

In [None]:
# converting paragraph into words

words = nltk.word_tokenize(paragraph)
print(len(words))
words

# Stemming and lemmatization 

In [None]:
# stemming ( keep the common for words, used in sentiment analysis models)
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
stemmer = PorterStemmer()

In [None]:
for i in range(len(sentences)):
  tmp_word = nltk.word_tokenize(sentences[i])

  tmp_word = [stemmer.stem(word) for word in tmp_word if word not in set(stopwords.words('english'))]
  sentences[i] = " ".join(tmp_word)

In [None]:
sentences

['india , offici republ india ( hindi : bhārat gaṇarājya ) , [ 26 ] countri south asia .',
 'It seventh-largest countri area , second-most popul countri , popul democraci world .',
 'bound indian ocean south , arabian sea southwest , bay bengal southeast , share land border pakistan west ; [ f ] china , nepal , bhutan north ; bangladesh , myanmar east .',
 'In indian ocean , india vicin sri lanka maldiv ; andaman nicobar island share maritim border thailand , myanmar indonesia , modern human arriv indian subcontin africa later 55,000 year ago .',
 '[ 27 ] [ 28 ] [ 29 ] their long occup , initi vari form isol hunter-gather , made region highli divers , second africa human genet divers .',
 '[ 30 ] settl life emerg subcontin western margin indu river basin 9,000 year ago , evolv gradual indu valley civilis third millennium bce .',
 '[ 31 ] By 1200 bce , archaic form sanskrit , indo-european languag , diffus india northwest , [ 32 ] [ 33 ] unfold languag rigveda , record dawn hinduism ind

In [None]:
# Lemetizer ( keep the meaning of word, used in chatbot)
from nltk.stem import WordNetLemmatizer

In [None]:
lemma = WordNetLemmatizer()

In [None]:
for i in range(len(sentences)):
  tmp_word = nltk.word_tokenize(sentences[i])
  tmp_word = [lemma.lemmatize(word) for word in tmp_word if word not in set(stopwords.words('english'))]
  sentences[i] = " ".join(tmp_word)

In [None]:
sentences

['India , officially Republic India ( Hindi : Bhārat Gaṇarājya ) , [ 26 ] country South Asia .',
 'It seventh-largest country area , second-most populous country , populous democracy world .',
 'Bounded Indian Ocean south , Arabian Sea southwest , Bay Bengal southeast , share land border Pakistan west ; [ f ] China , Nepal , Bhutan north ; Bangladesh , Myanmar east .',
 'In Indian Ocean , India vicinity Sri Lanka Maldives ; Andaman Nicobar Islands share maritime border Thailand , Myanmar Indonesia , Modern human arrived Indian subcontinent Africa later 55,000 year ago .',
 '[ 27 ] [ 28 ] [ 29 ] Their long occupation , initially varying form isolation hunter-gatherer , made region highly diverse , second Africa human genetic diversity .',
 '[ 30 ] Settled life emerged subcontinent western margin Indus river basin 9,000 year ago , evolving gradually Indus Valley Civilisation third millennium BCE .',
 '[ 31 ] By 1200 BCE , archaic form Sanskrit , Indo-European language , diffused India no

# Bag of words with lemmitazation

In [None]:
from nltk.internals import read_int
import re

corpus = []
for i in range(len(sentences)):
  # taking only alphabets
  review = re.sub('[^a-zA-Z]',' ', sentences[i])
  review = review.lower()
  review = review.split()
  review = [lemma.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
  review = " ".join(review)
  corpus.append(review)

NameError: ignored

In [None]:
corpus

In [None]:
# Creating bag of words

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
print(len(x))
x

13


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# TF IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
x = cv.fit_transform(corpus).toarray()
print(len(x))
x

# Word2Vec

In [None]:
from gensim.models import Word2Vec

In [None]:

text = re.sub(r"\[[0-9]*\]", " ", paragraph)
# removing special character
text = re.sub(r"\s+", " ", text)
# converting all in lower
text = text.lower()
text = re.sub(r"\d", " ", text)
text = re.sub(r"\s+", " ", text)
text

'india, officially the republic of india (hindi: bhārat gaṇarājya), is a country in south asia. it is the seventh-largest country by area, the second-most populous country, and the most populous democracy in the world. bounded by the indian ocean on the south, the arabian sea on the southwest, and the bay of bengal on the southeast, it shares land borders with pakistan to the west;[f] china, nepal, and bhutan to the north; and bangladesh and, myanmar to the east. in the indian ocean, india is in the vicinity of sri lanka and the maldives; its andaman and nicobar islands share a maritime border with thailand, myanmar and indonesia, modern humans arrived on the indian subcontinent from africa no later than , years ago. their long occupation, initially in varying forms of isolation as hunter-gatherers, has made the region highly diverse, second only to africa in human genetic diversity. settled life emerged on the subcontinent in the western margins of the indus river basin , years ago, e

In [None]:
sentences = nltk.sent_tokenize(text)
sentences = [nltk.word_tokenize(sentence) for sentence in sentences ]

In [None]:
for i in range(len(sentences)):
  sentences[i] = [word for word in sentences[i] if word not in set(stopwords.words('english'))]

In [None]:
sentences

In [None]:
model = Word2Vec(sentences, min_count=1)

In [None]:
words = model.wv.vocab
words

In [None]:
vector = model.wv['india']
len(vector)

100

In [None]:
similar = model.wv.most_similar('asia')
similar

[('country', 0.24626584351062775),
 ('populous', 0.2228507101535797),
 ('g', 0.22104939818382263),
 ('consolidations', 0.21067361533641815),
 ('southwest', 0.1958770900964737),
 ('dravidian-languages', 0.18834611773490906),
 ('within', 0.18729442358016968),
 (':', 0.18541505932807922),
 ('dravidian', 0.16651472449302673),
 ('hindi', 0.16215012967586517)]

# Spam Detection

In [None]:
import pandas as pd

In [None]:
message = pd.read_csv('SMSSpamCollection', sep='\t', names=["labels", "message"])

In [None]:
message

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
ps = PorterStemmer()
corpus = []
for i in range(len(message)):
  review = re.sub("[^a-zA-z]", " ", message.iloc[i][1])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
  review = " ".join(review)
  corpus.append(review)

In [None]:
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

y = pd.get_dummies(message.iloc[:,0])

In [None]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y = y.iloc[:,1]
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: spam, Length: 5572, dtype: uint8

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [None]:
# train model using Naive bayer claassifier

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(x_train, y_train)

In [None]:
pre = model.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, pre)

In [None]:
cm

array([[956,  10],
       [  7, 142]])

In [None]:
accuracy_score(y_test, pre)

0.9847533632286996

# Stock market sentiment analysis

In [None]:
import pandas as pd
data = pd.read_csv("Data.csv", encoding = "ISO-8859-1")
data

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite
2,2000-01-05,0,Coventry caught on counter by Flo,United's rivals on the road to Rio,Thatcher issues defence before trial by video,Police help Smith lay down the law at Everton,Tale of Trautmann bears two more retellings,England on the rack,Pakistan retaliate with call for video of Walsh,Cullinan continues his Cape monopoly,...,South Melbourne (Australia),Necaxa (Mexico),Real Madrid (Spain),Raja Casablanca (Morocco),Corinthians (Brazil),Tony's pet project,Al Nassr (Saudi Arabia),Ideal Holmes show,Pinochet leaves hospital after tests,Useful links
3,2000-01-06,1,Pilgrim knows how to progress,Thatcher facing ban,McIlroy calls for Irish fighting spirit,Leicester bin stadium blueprint,United braced for Mexican wave,"Auntie back in fashion, even if the dress look...",Shoaib appeal goes to the top,Hussain hurt by 'shambles' but lays blame on e...,...,Putin admits Yeltsin quit to give him a head s...,BBC worst hit as digital TV begins to bite,How much can you pay for...,Christmas glitches,"Upending a table, Chopping a line and Scoring ...","Scientific evidence 'unreliable', defence claims",Fusco wins judicial review in extradition case,Rebels thwart Russian advance,Blair orders shake-up of failing NHS,Lessons of law's hard heart
4,2000-01-07,1,Hitches and Horlocks,Beckham off but United survive,Breast cancer screening,Alan Parker,Guardian readers: are you all whingers?,Hollywood Beyond,Ashes and diamonds,Whingers - a formidable minority,...,Most everywhere: UDIs,Most wanted: Chloe lunettes,Return of the cane 'completely off the agenda',From Sleepy Hollow to Greeneland,Blunkett outlines vision for over 11s,"Embattled Dobson attacks 'play now, pay later'...",Doom and the Dome,What is the north-south divide?,Aitken released from jail,Gone aloft
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4096,2016-06-27,0,Barclays and RBS shares suspended from trading...,Pope says Church should ask forgiveness from g...,Poland 'shocked' by xenophobic abuse of Poles ...,"There will be no second referendum, cabinet ag...","Scotland welcome to join EU, Merkel ally says",Sterling dips below Friday's 31-year low amid ...,No negative news about South African President...,Surge in Hate Crimes in the U.K. Following U.K...,...,German lawyers to probe Erdogan over alleged w...,"Boris Johnson says the UK will continue to ""in...",Richard Branson is calling on the UK governmen...,Turkey 'sorry for downing Russian jet',Edward Snowden lawyer vows new push for pardon...,Brexit opinion poll reveals majority don't wan...,"Conservative MP Leave Campaigner: ""The leave c...","Economists predict UK recession, further weake...","New EU 'superstate plan by France, Germany: Cr...",Pakistani clerics declare transgender marriage...
4097,2016-06-28,1,"2,500 Scientists To Australia: If You Want To ...","The personal details of 112,000 French police ...",S&amp;P cuts United Kingdom sovereign credit r...,Huge helium deposit found in Africa,CEO of the South African state broadcaster qui...,"Brexit cost investors $2 trillion, the worst o...",Hong Kong democracy activists call for return ...,Brexit: Iceland president says UK can join 'tr...,...,"US, Canada and Mexico pledge 50% of power from...",There is increasing evidence that Australia is...,"Richard Branson, the founder of Virgin Group, ...","37,000-yr-old skull from Borneo reveals surpri...",Palestinians stone Western Wall worshipers; po...,Jean-Claude Juncker asks Farage: Why are you h...,"""Romanians for Remainians"" offering a new home...",Brexit: Gibraltar in talks with Scotland to st...,8 Suicide Bombers Strike Lebanon,Mexico's security forces routinely use 'sexual...
4098,2016-06-29,1,Explosion At Airport In Istanbul,Yemeni former president: Terrorism is the offs...,UK must accept freedom of movement to access E...,Devastated: scientists too late to captive bre...,British Labor Party leader Jeremy Corbyn loses...,A Muslim Shop in the UK Was Just Firebombed Wh...,Mexican Authorities Sexually Torture Women in ...,UK shares and pound continue to recover,...,"Escape Tunnel, Dug by Hand, Is Found at Holoca...",The land under Beijing is sinking by as much a...,Car bomb and Anti-Islamic attack on Mosque in ...,Emaciated lions in Taiz Zoo are trapped in blo...,Rupert Murdoch describes Brexit as 'wonderful'...,More than 40 killed in Yemen suicide attacks,Google Found Disastrous Symantec and Norton Vu...,Extremist violence on the rise in Germany: Dom...,BBC News: Labour MPs pass Corbyn no-confidence...,Tiny New Zealand town with 'too many jobs' lau...
4099,2016-06-30,1,Jamaica proposes marijuana dispensers for tour...,Stephen Hawking says pollution and 'stupidity'...,Boris Johnson says he will not run for Tory pa...,Six gay men in Ivory Coast were abused and for...,Switzerland denies citizenship to Muslim immig...,Palestinian terrorist stabs israeli teen girl ...,Puerto Rico will default on $1 billion of debt...,Republic of Ireland fans to be awarded medal f...,...,Googles free wifi at Indian railway stations i...,Mounting evidence suggests 'hobbits' were wipe...,The men who carried out Tuesday's terror attac...,Calls to suspend Saudi Arabia from UN Human Ri...,More Than 100 Nobel Laureates Call Out Greenpe...,British pedophile sentenced to 85 years in US ...,"US permitted 1,200 offshore fracks in Gulf of ...",We will be swimming in ridicule - French beach...,UEFA says no minutes of silence for Istanbul v...,Law Enforcement Sources: Gun Used in Paris Ter...


In [None]:
train = data[data['Date'] < '20150101']
test = data[data['Date'] >= '20150101']

In [None]:
td = train.iloc[:,2:27]
td.replace("[^a-zA-Z]", " ", regex=True, inplace = True)

list1 = [i for i in range(25)]
new_index = [str(i) for i in list1]
td.columns = new_index
td.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,A hindrance to operations extracts from the...,Scorecard,Hughes instant hit buoys Blues,Jack gets his skates on at ice cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,Derby raise a glass to Strupar s debut double,Southgate strikes Leeds pay the penalty,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl s successor drawn into scandal,The difference between men and women,Sara Denver nurse turned solicitor,Diana s landmine crusade put Tories in a panic,Yeltsin s resignation caught opposition flat f...,Russian roulette,Sold out,Recovering a title
1,Scorecard,The best lake scene,Leader German sleaze inquiry,Cheerio boyo,The main recommendations,Has Cubie killed fees,Has Cubie killed fees,Has Cubie killed fees,Hopkins furious at Foster s lack of Hannibal...,Has Cubie killed fees,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man s extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn t know without the ...,Millennium bug fails to bite


In [None]:
for ind in new_index:
  td[ind] = td[ind].str.lower()
td.head(1)  

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,a hindrance to operations extracts from the...,scorecard,hughes instant hit buoys blues,jack gets his skates on at ice cold alex,chaos as maracana builds up for united,depleted leicester prevail as elliott spoils e...,hungry spurs sense rich pickings,gunners so wide of an easy target,derby raise a glass to strupar s debut double,southgate strikes leeds pay the penalty,...,flintoff injury piles on woe for england,hunters threaten jospin with new battle of the...,kohl s successor drawn into scandal,the difference between men and women,sara denver nurse turned solicitor,diana s landmine crusade put tories in a panic,yeltsin s resignation caught opposition flat f...,russian roulette,sold out,recovering a title


In [None]:
td.tail(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
3974,microsoft corp said on wednesday it will begin...,greek orthodox church homosexuals are god s c...,vladmir putin officially dissolves roscosmos ...,donald trump could be refused entry to the uk ...,saudi arabia says its ready to meet any additi...,new year fireworks and festivities have been c...,new species of human may have shared our caves...,egypt becomes second nation to shut down faceb...,polish government takes control of public medi...,from jan uk to stop financial aid to india ...,...,ramadi residents fleeing isis they wanted to...,israeli gov t approves nis billion plan to ...,russia denies absurd claims that air strikes...,web attack knocks bbc websites offline,iran says any u s sanctions on missiles illegal,maternity benefits indian government plans to...,rat on a plane forces air india flight to tu...,philippines to join china backed aiib infrastr...,espn invites five fifa presidential candidates...,hamas to keep palestinians from ringing in new...


In [None]:
headline = []
for row in range(len(td)):    
  headline.append(" ".join(str(x) for x in td.iloc[row,0:25]))

In [None]:
headline

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier


In [None]:
cv = CountVectorizer(ngram_range=(2,2))
train_d = cv.fit_transform(headline)

In [None]:
model = RandomForestClassifier(n_estimators=200, criterion='entropy')
model.fit(train_d, train['Label'])

RandomForestClassifier(criterion='entropy', n_estimators=200)

In [None]:
test_headline = []
for row in range(len(test)):    
  test_headline.append(" ".join(str(x) for x in test.iloc[row,2:27]))

test_d = cv.transform(test_headline)  

In [None]:
len(test_headline)

126

In [None]:
pred = model.predict(test_d)

In [None]:
len(pred)

126

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(test['Label'], pred)

In [None]:
cm

array([[10, 46],
       [13, 57]])

 # Fake news classifier

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("/content/train.csv", encoding='latin-1')
df.head(1)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didnât Even See Comeyâs...,Darrell Lucus,House Dem Aide: We Didnât Even See Comeyâs...,1


In [None]:
x = df.drop('label', axis=1)
x.head(1)

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didnât Even See Comeyâs...,Darrell Lucus,House Dem Aide: We Didnât Even See Comeyâs...


In [None]:
y = df['label']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
df = df.dropna()

In [None]:
message = df.copy()
message.reset_index(inplace=True)

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

ps = PorterStemmer()

corpus = []

for i in range(len(message)):
  review = re.sub("[^a-zA-Z]"," ", message['title'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
  review = " ".join(review)
  corpus.append(review)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [None]:
y=message['label']

In [None]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=42, test_size=0.2)

In [None]:
cv.get_feature_names()

In [None]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (2, 2),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [None]:
count_df = pd.DataFrame(x_train)
count_df

In [None]:
from sklearn import naive_bayes
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

model.fit(x_train, y_train)
pred = model.predict(x_test)

In [None]:
from nltk import metrics
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
cm

array([[1907,  175],
       [ 677,  898]])

# passive aggressive classifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
lin_model = PassiveAggressiveClassifier(max_iter=50)

In [None]:
lin_model.fit(x_train, y_train)
pred = lin_model.predict(x_test)
cm = confusion_matrix(y_test, pred)
cm

array([[1643,  439],
       [  56, 1519]])

# multinomial classifier with hyperparameter

In [None]:
classifier = MultinomialNB(alpha=0.1)

In [None]:
import sklearn
import numpy as np
from sklearn.metrics import accuracy_score

prev_score = 0
for alpha in np.arange(0,1,0.1):
  
  sub_model = MultinomialNB(alpha=alpha)
  sub_model.fit(x_train, y_train)
  y_pred = sub_model.predict(x_test)
  score = accuracy_score(y_test, y_pred)
  print(score)

  % _ALPHA_MIN


0.7571780147662018
0.7686628383921247
0.7697566311184031
0.7700300792999727
0.7700300792999727
0.7681159420289855
0.7667487011211376
0.7670221493027072
0.7670221493027072
0.7667487011211376


# Word embedding with keras

In [None]:
from keras.preprocessing.text import one_hot

In [None]:
sentence = ["the glass of milk",
            "the glass of juice",
            "i am a good boy",
            "understand the meaning",
            "your videos are good"]

In [None]:
voc_size = 10000

In [None]:
onehot_sent = [one_hot(words, voc_size) for words in sentence]
onehot_sent

[[324, 5833, 8413, 4948],
 [324, 5833, 8413, 6840],
 [9625, 7563, 1193, 4809, 1045],
 [6768, 324, 7122],
 [5378, 2176, 1927, 4809]]

# Word embedding

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
import numpy as np

In [None]:
from gensim.matutils import pad
sent_l = 8
embedded = pad_sequences(onehot_sent, padding='pre', maxlen=sent_l)
embedded

array([[   0,    0,    0,    0,  324, 5833, 8413, 4948],
       [   0,    0,    0,    0,  324, 5833, 8413, 6840],
       [   0,    0,    0, 9625, 7563, 1193, 4809, 1045],
       [   0,    0,    0,    0,    0, 6768,  324, 7122],
       [   0,    0,    0,    0, 5378, 2176, 1927, 4809]], dtype=int32)

In [None]:
dim = 15

In [None]:
model = Sequential()
model.add(Embedding(voc_size, 15, input_length=sent_l))
model.compile('adam', 'mse')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 15)             150000    
                                                                 
Total params: 150,000
Trainable params: 150,000
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.predict(embedded)

In [None]:
model.predict(embedded[0])



array([[-0.03015925, -0.03700181,  0.0446083 ,  0.04006027, -0.03383664,
         0.02871597, -0.03615773, -0.04672622,  0.03441199, -0.02341638,
         0.03590367, -0.03995843, -0.00548173, -0.01363111, -0.03005326],
       [-0.03015925, -0.03700181,  0.0446083 ,  0.04006027, -0.03383664,
         0.02871597, -0.03615773, -0.04672622,  0.03441199, -0.02341638,
         0.03590367, -0.03995843, -0.00548173, -0.01363111, -0.03005326],
       [-0.03015925, -0.03700181,  0.0446083 ,  0.04006027, -0.03383664,
         0.02871597, -0.03615773, -0.04672622,  0.03441199, -0.02341638,
         0.03590367, -0.03995843, -0.00548173, -0.01363111, -0.03005326],
       [-0.03015925, -0.03700181,  0.0446083 ,  0.04006027, -0.03383664,
         0.02871597, -0.03615773, -0.04672622,  0.03441199, -0.02341638,
         0.03590367, -0.03995843, -0.00548173, -0.01363111, -0.03005326],
       [ 0.01255033, -0.01388117,  0.00227948, -0.01808159, -0.03711464,
        -0.03131342,  0.04491737, -0.00208696, 

# 2nd model

In [None]:
# adding a LSTM layer
embedding_vector_feature = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_feature, input_length=sent_l))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 8, 40)             400000    
                                                                 
 dropout (Dropout)           (None, 8, 40)             0         
                                                                 
 lstm_4 (LSTM)               (None, 100)               56400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 456,501
Trainable params: 456,501
Non-trainable params: 0
_________________________________________________________________


In [None]:
# scale down the values
from sklearn.preprocessing import MinMaxScaler 

# scale up ( reverse )
scaler.inverse_transform