# Exp 1 - Sentiment Analysis by a movie

In [39]:
import pandas as pd
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sidsr\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [40]:
# reading and wragling data

df_avatar = pd.read_csv('IMDB Dataset.csv', engine='python')

df_avatar_lines = df_avatar.groupby('sentiment').count()

df_avatar_lines = df_avatar_lines.sort_values(by=['review'], ascending=False)[:10]

top_character_names = df_avatar_lines.index.values

In [41]:
# filtering out non-top characters

df_character_sentiment = df_avatar[df_avatar['sentiment'].isin(top_character_names)]

df_character_sentiment = df_character_sentiment[['sentiment', 'review']]

In [42]:
# calculating sentiment score

sid = SentimentIntensityAnalyzer()

df_character_sentiment.reset_index(inplace=True, drop=True)

df_character_sentiment[['neg', 'neu', 'pos', 'compound']] = df_character_sentiment['review'].apply(sid.polarity_scores).apply(pd.Series)

In [43]:
df_character_sentiment

Unnamed: 0,sentiment,review,neg,neu,pos,compound
0,positive,One of the other reviewers has mentioned that ...,0.203,0.748,0.048,-0.9951
1,positive,A wonderful little production. <br /><br />The...,0.053,0.776,0.172,0.9641
2,positive,I thought this was a wonderful way to spend ti...,0.094,0.714,0.192,0.9605
3,negative,Basically there's a family where a little boy ...,0.138,0.797,0.065,-0.9213
4,positive,"Petter Mattei's ""Love in the Time of Money"" is...",0.052,0.801,0.147,0.9744
...,...,...,...,...,...,...
49995,positive,I thought this movie did a down right good job...,0.047,0.753,0.199,0.9890
49996,negative,"Bad plot, bad dialogue, bad acting, idiotic di...",0.166,0.720,0.114,-0.6693
49997,negative,I am a Catholic taught in parochial elementary...,0.208,0.683,0.108,-0.9851
49998,negative,I'm going to have to disagree with the previou...,0.105,0.813,0.082,-0.7648


# Exp 2 - Named Identity Recognition using Spacy

In [26]:
import spacy

nlp = spacy.load('en_core_web_sm')

sentence = "Apple is looking at buying U.K. startup for $1 billion"

doc = nlp(sentence)

for ent in doc.ents:
	print(ent.text, ent.start_char, ent.end_char, ent.label_)


Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


# Exp 3 - Stemming & Lemmatization

Stemming

In [27]:
# importing modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

sentence = "Programmers program with programming languages"
words = word_tokenize(sentence)

for w in words:
	print(w, " : ", ps.stem(w))

Programmers  :  programm
program  :  program
with  :  with
programming  :  program
languages  :  languag


Lemmatization

In [28]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Create WordNetLemmatizer object
wnl = WordNetLemmatizer()

# single word lemmatization examples
list1 = ['kites', 'babies', 'dogs', 'flying', 'smiling',
		'driving', 'died', 'tried', 'feet']
for words in list1:
	print(words + " ---> " + wnl.lemmatize(words))
	
#> kites ---> kite
#> babies ---> baby
#> dogs ---> dog
#> flying ---> flying
#> smiling ---> smiling
#> driving ---> driving
#> died ---> died
#> tried ---> tried
#> feet ---> foot


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sidsr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


kites ---> kite
babies ---> baby
dogs ---> dog
flying ---> flying
smiling ---> smiling
driving ---> driving
died ---> died
tried ---> tried
feet ---> foot


# Exp 4 - Bag of words

In [34]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

text = ["I love watching Netflix. I love watching Suits", "I hate going out. I hate eating out"]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text':text})
cv = CountVectorizer(stop_words='english')
cv_matrix = cv.fit_transform(df['text'])
df_dtm = pd.DataFrame(cv_matrix.toarray(),
 index=df['review'].values,
 columns=cv.get_feature_names())
df_dtm

Unnamed: 0,eating,going,hate,love,netflix,suits,watching
review1,0,0,0,2,1,1,2
review2,1,1,2,0,0,0,0


# Exp 5 - Term Frequency–Inverse Document Frequency(TF - IDF)

In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
text = ["I love writing code in Python. I love Python code",
 "I hate writing code in Java. I hate Java code"]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text':text})
tfidf = TfidfVectorizer(stop_words='english', norm=None)
tfidf_matrix = tfidf.fit_transform(df['text'])
df_dtm = pd.DataFrame(tfidf_matrix.toarray(),
 index=df['review'].values,
 columns=tfidf.get_feature_names())
df_dtm

Unnamed: 0,code,hate,java,love,python,writing
review1,2.0,0.0,0.0,2.81093,2.81093,1.0
review2,2.0,2.81093,2.81093,0.0,0.0,1.0


# Exp 6 - Stopwords

In [35]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sidsr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [37]:
sw_nltk = stopwords.words('english')
print(sw_nltk)
print(len(sw_nltk))
text = "When I first met her she was very quiet. She remained quiet during the entire two hour long journey from Stony Brook to New York."
words = [word for word in text.split() if word.lower() not in sw_nltk]
new_text = " ".join(words)
print(new_text)
print("Old length: ", len(text))
print("New length: ", len(new_text))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Exp 7 - POS Tagging

In [40]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sidsr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [41]:
txt = "Sukanya, Rajib and Naba are my good friends. " \
 "Sukanya is getting married next year. " \
 "Marriage is a big step in one’s life." \
 "It is both exciting and frightening. " \
 "But friendship is a sacred bond between people." \
 "It is a special kind of love between us. " \
 "Many of you must have tried searching for a friend "\
 "but never found the right one."
 
# sent_tokenize is one of instances of
# PunktSentenceTokenizer from the nltk.tokenize.punkt module

tokenized = sent_tokenize(txt)
for i in tokenized:
    # Word tokenizers is used to find the words
    # and punctuation in a string
    wordsList = nltk.word_tokenize(i)
    # removing stop words from wordList
    wordsList = [w for w in wordsList if not w in stop_words]
    # Using a Tagger. Which is part-of-speech
    # tagger or POS-tagger.
    tagged = nltk.pos_tag(wordsList)
    print(tagged)


[('Sukanya', 'NNP'), (',', ','), ('Rajib', 'NNP'), ('Naba', 'NNP'), ('good', 'JJ'), ('friends', 'NNS'), ('.', '.')]
[('Sukanya', 'NNP'), ('getting', 'VBG'), ('married', 'VBN'), ('next', 'JJ'), ('year', 'NN'), ('.', '.')]
[('Marriage', 'NN'), ('big', 'JJ'), ('step', 'NN'), ('one', 'CD'), ('’', 'NN'), ('life.It', 'NN'), ('exciting', 'VBG'), ('frightening', 'NN'), ('.', '.')]
[('But', 'CC'), ('friendship', 'NN'), ('sacred', 'VBD'), ('bond', 'NN'), ('people.It', 'NN'), ('special', 'JJ'), ('kind', 'NN'), ('love', 'VB'), ('us', 'PRP'), ('.', '.')]
[('Many', 'JJ'), ('must', 'MD'), ('tried', 'VB'), ('searching', 'VBG'), ('friend', 'NN'), ('never', 'RB'), ('found', 'VBD'), ('right', 'JJ'), ('one', 'CD'), ('.', '.')]


# Exp 8 - Chunking

In [42]:
import nltk
sentence = [
 ("the", "DT"),
 ("book", "NN"),
 ("has","VBZ"),
 ("many","JJ"),
 ("chapters","NNS")
]
chunker = nltk.RegexpParser(
 r'''
 NP:{<DT><NN.*><.*>*<NN.*>}
 }<VB.*>{
 '''
)
chunker.parse(sentence)
Output = chunker.parse(sentence)
print(Output)

(S (NP the/DT book/NN) has/VBZ (NP many/JJ chapters/NNS))


# Exp 9 - WordNet

In [43]:
import nltk
from nltk.corpus import wordnet

In [45]:
synonyms = []
antonyms = []
for synset in wordnet.synsets("evil"):
    for l in synset.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
print(set(synonyms))
print(set(antonyms))

{'malign', 'vicious', 'evilness', 'malefic', 'immorality', 'wickedness', 'malevolent', 'evil', 'iniquity'}
{'goodness', 'good'}


# Exp 10 - Word Cloud

In [47]:
pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.2-cp38-cp38-win_amd64.whl (153 kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud

In [19]:
class WordCloudGeneration:
    def preprocessing(self, data):
        # convert all words to lowercase
        data = [item.lower() for item in data]
        # load the stop_words of english
        stop_words = set(stopwords.words('english'))
        # concatenate all the data with spaces.
        paragraph = ' '.join(data)
        # tokenize the paragraph using the inbuilt tokenizer
        word_tokens = word_tokenize(paragraph) 
        # filter words present in stopwords list 
        preprocessed_data = ' '.join([word for word in word_tokens if not word in stop_words])
        print("\n Preprocessed Data: " ,preprocessed_data)
        return preprocessed_data
    def create_word_cloud(self, final_data):
        # initiate WordCloud object with parameters width, height, maximum font size and background 
        color = "red"
        # call the generate method of WordCloud class to generate an image
        wordcloud = WordCloud(font_path="sans-serif", width=1600, height=800, max_font_size=200, background_color="black").generate(final_data)   
        # plt the image generated by WordCloud class
        plt.figure(figsize=(12,10))
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.show()
wordcloud_generator = WordCloudGeneration()
# you may uncomment the following line to use custom input
# input_text = input("Enter the text here: ")
input_text = 'These datasets are used for machine-learning research and have been cited in peer reviewed academic journals. Datasets are an integral part of the field of machine learning. Major advances in this field can result from advances in learning algorithms (such as deep learning), computer hardware, and, less-intuitively, the availability of high-quality training datasets.[1] High quality labeled training datasets for supervised and semi-supervised machine learning algorithms are usually difficult and expensive to produce because of the large amount of time needed to label the data. Although they do not need to be labeled, high-quality datasets for unsupervised learning can also be difficult and costly to produce.'
input_text = input_text.split('.')

clean_data = wordcloud_generator.preprocessing(input_text)

wordcloud_generator.create_word_cloud(clean_data)


 Preprocessed Data:  datasets used machine-learning research cited peer reviewed academic journals datasets integral part field machine learning major advances field result advances learning algorithms ( deep learning ) , computer hardware , , less-intuitively , availability high-quality training datasets [ 1 ] high quality labeled training datasets supervised semi-supervised machine learning algorithms usually difficult expensive produce large amount time needed label data although need labeled , high-quality datasets unsupervised learning also difficult costly produce


OSError: cannot open resource