# NLP concepts:

1. Sentiment Analysis
2. Named Entity Recognition (NER)
3. Stemming
4. Lemmatization
5. Bag of Words (BoW)
6. Term Frequency–Inverse Document Frequency (TF-IDF)
7. Wordcloud

In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# reading and wragling data
df_avatar = pd.read_csv('avatar.csv', engine='python')
df_avatar_lines = df_avatar.groupby('character').count()
df_avatar_lines

Unnamed: 0_level_0,Unnamed: 0,id,book,book_num,chapter,chapter_num,full_text,character_words,writer,director,imdb_rating
character,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Aang,1796,1796,1796,1796,1796,1796,1796,1796,1796,1796,1766
Aang and Sokka,2,2,2,2,2,2,2,2,2,2,2
Aang and Zuko,1,1,1,1,1,1,1,1,1,1,1
Aang:,1,1,1,1,1,1,1,1,1,1,1
Actor Bumi,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
Yung,10,10,10,10,10,10,10,10,10,10,10
Zei,27,27,27,27,27,27,27,27,27,27,27
Zhang leader,28,28,28,28,28,28,28,28,28,28,28
Zhao,107,107,107,107,107,107,107,107,107,107,94


In [3]:
df_avatar_lines = df_avatar_lines.sort_values(by=['character_words'], ascending=False)[:10]
top_character_names = df_avatar_lines.index.values
print(top_character_names)

['Aang' 'Sokka' 'Katara' 'Zuko' 'Toph' 'Iroh' 'Azula' 'Jet' 'Suki' 'Zhao']


In [4]:
df_avatar_lines

Unnamed: 0_level_0,Unnamed: 0,id,book,book_num,chapter,chapter_num,full_text,character_words,writer,director,imdb_rating
character,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Aang,1796,1796,1796,1796,1796,1796,1796,1796,1796,1796,1766
Sokka,1639,1639,1639,1639,1639,1639,1639,1639,1639,1639,1627
Katara,1437,1437,1437,1437,1437,1437,1437,1437,1437,1437,1429
Zuko,776,776,776,776,776,776,776,776,776,776,766
Toph,507,507,507,507,507,507,507,507,507,507,507
Iroh,337,337,337,337,337,337,337,337,337,337,328
Azula,211,211,211,211,211,211,211,211,211,211,211
Jet,134,134,134,134,134,134,134,134,134,134,134
Suki,114,114,114,114,114,114,114,114,114,114,114
Zhao,107,107,107,107,107,107,107,107,107,107,94


In [5]:
# filtering out non-top characters
df_character_sentiment = df_avatar[df_avatar['character'].isin(top_character_names)]
df_character_sentiment

Unnamed: 0.1,Unnamed: 0,id,book,book_num,chapter,chapter_num,character,full_text,character_words,writer,director,imdb_rating
0,1,1,Water,1,The Boy in the Iceberg,1,Katara,Water. Earth. Fire. Air. My grandmother used t...,Water. Earth. Fire. Air. My grandmother used t...,"<U+200E>Michael Dante DiMartino, Bryan Konietz...",Dave Filoni,8.1
2,3,3,Water,1,The Boy in the Iceberg,1,Sokka,It's not getting away from me this time. [Clos...,It's not getting away from me this time. Watc...,"<U+200E>Michael Dante DiMartino, Bryan Konietz...",Dave Filoni,8.1
4,5,5,Water,1,The Boy in the Iceberg,1,Katara,"[Happily surprised.] Sokka, look!","Sokka, look!","<U+200E>Michael Dante DiMartino, Bryan Konietz...",Dave Filoni,8.1
5,6,6,Water,1,The Boy in the Iceberg,1,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...","Sshh! Katara, you're going to scare it away. ...","<U+200E>Michael Dante DiMartino, Bryan Konietz...",Dave Filoni,8.1
7,8,8,Water,1,The Boy in the Iceberg,1,Katara,[Struggling with the water that passes right i...,"But, Sokka! I caught one!","<U+200E>Michael Dante DiMartino, Bryan Konietz...",Dave Filoni,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...
13378,13379,13379,Fire,3,"Sozin's Comet, Part 4: Avatar Aang",21,Zuko,At least you don't look like a boar-q-pine! My...,At least you don't look like a boar-q-pine! My...,"Aaron Ehasz, Michael Dante DiMartino, Bryan Ko...",Joaquim Dos Santos,9.8
13380,13381,13381,Fire,3,"Sozin's Comet, Part 4: Avatar Aang",21,Suki,And why did you paint me firebending?,And why did you paint me firebending?,"Aaron Ehasz, Michael Dante DiMartino, Bryan Ko...",Joaquim Dos Santos,9.8
13381,13382,13382,Fire,3,"Sozin's Comet, Part 4: Avatar Aang",21,Sokka,I thought it looked more exciting that way. [M...,I thought it looked more exciting that way. O...,"Aaron Ehasz, Michael Dante DiMartino, Bryan Ko...",Joaquim Dos Santos,9.8
13382,13383,13383,Fire,3,"Sozin's Comet, Part 4: Avatar Aang",21,Iroh,"[Points at painting.] Hey, my belly's not that...","Hey, my belly's not that big anymore. I've rea...","Aaron Ehasz, Michael Dante DiMartino, Bryan Ko...",Joaquim Dos Santos,9.8


In [6]:
df_character_sentiment = df_character_sentiment[['character', 'character_words']]
df_character_sentiment

Unnamed: 0,character,character_words
0,Katara,Water. Earth. Fire. Air. My grandmother used t...
2,Sokka,It's not getting away from me this time. Watc...
4,Katara,"Sokka, look!"
5,Sokka,"Sshh! Katara, you're going to scare it away. ..."
7,Katara,"But, Sokka! I caught one!"
...,...,...
13378,Zuko,At least you don't look like a boar-q-pine! My...
13380,Suki,And why did you paint me firebending?
13381,Sokka,I thought it looked more exciting that way. O...
13382,Iroh,"Hey, my belly's not that big anymore. I've rea..."


In [10]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Karthik\AppData\Roaming\nltk_data...


True

In [11]:
# calculating sentiment score
sid = SentimentIntensityAnalyzer()
df_character_sentiment.reset_index(inplace=True, drop=True)
df_character_sentiment

Unnamed: 0,character,character_words
0,Katara,Water. Earth. Fire. Air. My grandmother used t...
1,Sokka,It's not getting away from me this time. Watc...
2,Katara,"Sokka, look!"
3,Sokka,"Sshh! Katara, you're going to scare it away. ..."
4,Katara,"But, Sokka! I caught one!"
...,...,...
7053,Zuko,At least you don't look like a boar-q-pine! My...
7054,Suki,And why did you paint me firebending?
7055,Sokka,I thought it looked more exciting that way. O...
7056,Iroh,"Hey, my belly's not that big anymore. I've rea..."


In [12]:
df_character_sentiment[['neg', 'neu', 'pos', 'compound']] = df_character_sentiment['character_words'].apply(sid.polarity_scores).apply(pd.Series)
df_character_sentiment

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_character_sentiment[['neg', 'neu', 'pos', 'compound']] = df_character_sentiment['character_words'].apply(sid.polarity_scores).apply(pd.Series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_character_sentiment[['neg', 'neu', 'pos', 'compound']] = df_character_sentiment['character_words'].apply(sid.polarity_scores).apply(pd.Series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.p

Unnamed: 0,character,character_words,neg,neu,pos,compound
0,Katara,Water. Earth. Fire. Air. My grandmother used t...,0.196,0.735,0.069,-0.9718
1,Sokka,It's not getting away from me this time. Watc...,0.000,1.000,0.000,0.0000
2,Katara,"Sokka, look!",0.000,1.000,0.000,0.0000
3,Sokka,"Sshh! Katara, you're going to scare it away. ...",0.200,0.800,0.000,-0.5411
4,Katara,"But, Sokka! I caught one!",0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...
7053,Zuko,At least you don't look like a boar-q-pine! My...,0.183,0.817,0.000,-0.4007
7054,Suki,And why did you paint me firebending?,0.000,1.000,0.000,0.0000
7055,Sokka,I thought it looked more exciting that way. O...,0.000,0.687,0.313,0.7501
7056,Iroh,"Hey, my belly's not that big anymore. I've rea...",0.000,1.000,0.000,0.0000


In [1]:
# import spacy # testing for Entity splitting
# nlp = spacy.load("en_core_web_sm")
# doc = nlp("Biden invites Ukrainian president to White House this summer")
# print([(X.text, X.label_) for X in doc.ents])

In [15]:
#!pip install spacy

Collecting spacy
  Downloading spacy-3.4.1-cp39-cp39-win_amd64.whl (11.8 MB)
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.10-py2.py3-none-any.whl (21 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.7-cp39-cp39-win_amd64.whl (96 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.3-py3-none-any.whl (9.3 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.8-cp39-cp39-win_amd64.whl (18 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.2-py3-none-any.whl (27 kB)
Collecting wasabi<1.1.0,>=0.9.1
  Downloading wasabi-0.10.1-py3-none-any.whl (26 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.4-cp39-cp39-win_amd64.whl (450 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.2-py3-none-any.whl (42 kB)
Collecting thinc<8.2.0,>=8.1.0
  Downloading thinc-8.1.3-cp39-cp39-win_amd64.whl (1.3 MB)
Collecting langcod

In [17]:
#!pip install en_core_web_sm

ERROR: Could not find a version that satisfies the requirement en_core_web_sm (from versions: none)
ERROR: No matching distribution found for en_core_web_sm
