# TextBlob Coding Help!

## Import the required libraries

In [32]:
import pandas as pd
import sqlalchemy as db

## Getting the files

In [33]:
# Load pmfeedback.csv from the course GitHub repository into a DataFrame.
pmfeedback = pd.read_csv("https://raw.githubusercontent.com/casbdai/notebooks2023/main/Module2/HypeCase/pmfeedback.csv")

In [34]:
!wget https://github.com/casbdai/notebooks2023/raw/main/Module2/HypeCase/hypedb
engine = db.create_engine("sqlite:///hypedb")

--2023-06-29 01:36:37--  https://github.com/casbdai/notebooks2023/raw/main/Module2/HypeCase/hypedb
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... 

connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/casbdai/notebooks2023/main/Module2/HypeCase/hypedb [following]
--2023-06-29 01:36:37--  https://raw.githubusercontent.com/casbdai/notebooks2023/main/Module2/HypeCase/hypedb
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 286720 (280K) [application/octet-stream]
Saving to: ‘hypedb.4’


2023-06-29 01:36:37 (2.25 MB/s) - ‘hypedb.4’ saved [286720/286720]



In [35]:
# Inspect the database schema and list the tables available through the engine.
inspector = db.inspect(engine)
inspector.get_table_names()

['comments', 'ideas']

## Applying TextBlob

Run the following code to initialize TextBlob:

In [36]:
!pip install textblob

from textblob import TextBlob

# function that only returns the polarity score of TextBlob
def polarity(text):
    """Return TextBlob's polarity score for ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float or None
        ``TextBlob(text).sentiment.polarity``, or ``None`` when ``text`` is
        not a string (e.g. ``None`` or ``NaN``) or when scoring fails.
    """
    # Non-string cells (e.g. missing values) cannot be scored; skip them explicitly.
    if not isinstance(text, str):
        return None
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Best effort: a text that cannot be scored yields None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so a long-running .apply() can still be interrupted.
        return None

# function that only returns the subjectivity score of TextBlob
def subjectivity(text):
    """Return TextBlob's subjectivity score for ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float or None
        ``TextBlob(text).sentiment.subjectivity``, or ``None`` when ``text``
        is not a string (e.g. ``None`` or ``NaN``) or when scoring fails.
    """
    # Non-string cells (e.g. missing values) cannot be scored; skip them explicitly.
    if not isinstance(text, str):
        return None
    try:
        return TextBlob(text).sentiment.subjectivity
    except Exception:
        # Best effort: a text that cannot be scored yields None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so a long-running .apply() can still be interrupted.
        return None



To apply these functions to a DataFrame column, use the following syntax:

**dataframe["new_column_with_sentiment"] = dataframe["column_to_analyze"].apply(polarity)**

Replace `polarity` with `subjectivity` to get the subjectivity score instead.

In [37]:
# Read the whole "comments" table into a DataFrame. The connection is opened
# in a with-block so it is closed as soon as the query has been read, instead
# of being left open for the rest of the session.
with engine.connect() as connection:
    comments = pd.read_sql(db.text("SELECT * FROM comments;"), connection)
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IdeaID          87 non-null     object
 1   SampleComments  87 non-null     object
dtypes: object(2)
memory usage: 1.5+ KB


In [38]:
# Read the whole "ideas" table into a DataFrame. The connection is opened
# in a with-block so it is closed as soon as the query has been read, instead
# of being left open for the rest of the session.
with engine.connect() as connection:
    ideas = pd.read_sql(db.text("SELECT * FROM ideas;"), connection)
ideas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   IdeaID          100 non-null    object 
 1   Date            100 non-null    object 
 2   Title           100 non-null    object 
 3   Description     100 non-null    object 
 4   NumberVotings   100 non-null    float64
 5   NumberComments  100 non-null    int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [39]:

# Date: convert the stored values into pandas datetimes
ideas["Date"] = pd.to_datetime(ideas["Date"])

# Age: whole days elapsed between the moment this cell runs and the submission date.
# NOTE: the value depends on the day the notebook is executed.
elapsed_since_submission = pd.Timestamp.today() - ideas["Date"]
ideas["Age"] = elapsed_since_submission.dt.days

# NumVotings_PerDay: votes divided by Age in days, as another way of ranking ideas.
# Only rows with a positive Age are assigned; all other rows stay NaN.
has_positive_age = ideas["Age"] > 0
votes_per_day = ideas["NumberVotings"] / ideas["Age"]
ideas.loc[has_positive_age, "NumVotings_PerDay"] = votes_per_day
ideas.tail()

Unnamed: 0,IdeaID,Date,Title,Description,NumberVotings,NumberComments,Age,NumVotings_PerDay
95,4AA8E2252C42615B,2019-12-27,エクスポート,PowerSlidesにエクスポートした時、\n画像でなくグラフでエクスポートして欲しい。\...,1.0,0,7,0.142857
96,FADD84AD04F5E77E,2016-04-14,scheduled export of a report to pdf,It would be nice if there is an option to have...,5779.0,554,1359,4.252391
97,4B90250700F8FB9D,2020-01-03,dates as legends not formatted correctly,I am trying to do a simple clustered column ch...,1.0,0,0,
98,D81BA1BF75A5D348,2019-03-05,Add an interface in Analytics BI Reports for b...,A current capability we use in Analytics BI Re...,98.0,1,304,0.322368
99,4C2A6BC685232A86,2019-11-28,Export to PDF with Custom Font,Custom fonts can be embedded to reports in the...,12.0,1,36,0.333333


In [40]:
# Summarize the pmfeedback frame: column names, non-null counts and dtypes.
pmfeedback.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     10 non-null     object
 1   Selected  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


In [41]:
# Score every sample comment with both TextBlob helpers and rank each score.
# Column order is kept: subjectivity, its rank, polarity, its rank.
for score_name, scorer in (("subjectivity", subjectivity), ("polarity", polarity)):
    score_column = f"comment_{score_name}"
    comments[score_column] = comments["SampleComments"].apply(scorer)
    comments[f"{score_column}_rank"] = comments[score_column].rank()
comments.tail()

Unnamed: 0,IdeaID,SampleComments,comment_subjectivity,comment_subjectivity_rank,comment_polarity,comment_polarity_rank
82,B31FC439E10E7040,"Tuesday, 23. Juli 2019, 15:46 Uhr · report s...",0.457693,38.0,0.136201,39.0
83,B5ADDD323460D256,"Wednesday, 19. September 2018, 17:55 Uhr · r...",0.350699,13.0,0.062238,22.0
84,FADD84AD04F5E77E,"Thursday, 26. Dezember 2019, 21:05 Uhr · rep...",0.536746,62.0,0.192485,53.0
85,D81BA1BF75A5D348,"Thursday, 07. März 2019, 14:05 Uhr · report ...",0.75,83.0,0.8,86.5
86,4C2A6BC685232A86,"Wednesday, 04. Dezember 2019, 15:07 Uhr · re...",0.0,2.5,0.0,10.5


In [42]:
# Score every idea description with both TextBlob helpers and rank each score.
# Column order is kept: subjectivity, its rank, polarity, its rank.
for score_name, scorer in (("subjectivity", subjectivity), ("polarity", polarity)):
    score_column = f"Description_{score_name}"
    ideas[score_column] = ideas["Description"].apply(scorer)
    ideas[f"{score_column}_rank"] = ideas[score_column].rank()
ideas.tail()

Unnamed: 0,IdeaID,Date,Title,Description,NumberVotings,NumberComments,Age,NumVotings_PerDay,Description_subjectivity,Description_subjectivity_rank,Description_polarity,Description_polarity_rank
95,4AA8E2252C42615B,2019-12-27,エクスポート,PowerSlidesにエクスポートした時、\n画像でなくグラフでエクスポートして欲しい。\...,1.0,0,7,0.142857,0.0,7.0,0.0,20.0
96,FADD84AD04F5E77E,2016-04-14,scheduled export of a report to pdf,It would be nice if there is an option to have...,5779.0,554,1359,4.252391,0.825,97.0,0.45,96.0
97,4B90250700F8FB9D,2020-01-03,dates as legends not formatted correctly,I am trying to do a simple clustered column ch...,1.0,0,0,,0.357143,31.0,0.0,20.0
98,D81BA1BF75A5D348,2019-03-05,Add an interface in Analytics BI Reports for b...,A current capability we use in Analytics BI Re...,98.0,1,304,0.322368,0.549643,77.0,0.07,45.0
99,4C2A6BC685232A86,2019-11-28,Export to PDF with Custom Font,Custom fonts can be embedded to reports in the...,12.0,1,36,0.333333,0.225,18.0,0.25,78.0


In [43]:
# Left join on IdeaID: every idea is kept, and the columns of the comments
# frame are attached where the IdeaID matches (NaN where it does not).
ideas_comments = ideas.merge(comments, how="left", on="IdeaID")

ideas_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   IdeaID                         100 non-null    object        
 1   Date                           100 non-null    datetime64[ns]
 2   Title                          100 non-null    object        
 3   Description                    100 non-null    object        
 4   NumberVotings                  100 non-null    float64       
 5   NumberComments                 100 non-null    int64         
 6   Age                            100 non-null    int64         
 7   NumVotings_PerDay              99 non-null     float64       
 8   Description_subjectivity       100 non-null    float64       
 9   Description_subjectivity_rank  100 non-null    float64       
 10  Description_polarity           100 non-null    float64       
 11  Description_polarity

In [44]:
# Left join on Title: every row of ideas_comments is kept, and the pmfeedback
# columns are attached where the Title matches (NaN where it does not).
ideas_comments_pm = ideas_comments.merge(pmfeedback, how="left", on="Title")

ideas_comments_pm.info()

ideas_comments_pm.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   IdeaID                         100 non-null    object        
 1   Date                           100 non-null    datetime64[ns]
 2   Title                          100 non-null    object        
 3   Description                    100 non-null    object        
 4   NumberVotings                  100 non-null    float64       
 5   NumberComments                 100 non-null    int64         
 6   Age                            100 non-null    int64         
 7   NumVotings_PerDay              99 non-null     float64       
 8   Description_subjectivity       100 non-null    float64       
 9   Description_subjectivity_rank  100 non-null    float64       
 10  Description_polarity           100 non-null    float64       
 11  Description_polarity

Unnamed: 0,IdeaID,Date,Title,Description,NumberVotings,NumberComments,Age,NumVotings_PerDay,Description_subjectivity,Description_subjectivity_rank,Description_polarity,Description_polarity_rank,SampleComments,comment_subjectivity,comment_subjectivity_rank,comment_polarity,comment_polarity_rank,Selected
95,4AA8E2252C42615B,2019-12-27,エクスポート,PowerSlidesにエクスポートした時、\n画像でなくグラフでエクスポートして欲しい。\...,1.0,0,7,0.142857,0.0,7.0,0.0,20.0,,,,,,
96,FADD84AD04F5E77E,2016-04-14,scheduled export of a report to pdf,It would be nice if there is an option to have...,5779.0,554,1359,4.252391,0.825,97.0,0.45,96.0,"Thursday, 26. Dezember 2019, 21:05 Uhr · rep...",0.536746,62.0,0.192485,53.0,Yes
97,4B90250700F8FB9D,2020-01-03,dates as legends not formatted correctly,I am trying to do a simple clustered column ch...,1.0,0,0,,0.357143,31.0,0.0,20.0,,,,,,
98,D81BA1BF75A5D348,2019-03-05,Add an interface in Analytics BI Reports for b...,A current capability we use in Analytics BI Re...,98.0,1,304,0.322368,0.549643,77.0,0.07,45.0,"Thursday, 07. März 2019, 14:05 Uhr · report ...",0.75,83.0,0.8,86.5,
99,4C2A6BC685232A86,2019-11-28,Export to PDF with Custom Font,Custom fonts can be embedded to reports in the...,12.0,1,36,0.333333,0.225,18.0,0.25,78.0,"Wednesday, 04. Dezember 2019, 15:07 Uhr · re...",0.0,2.5,0.0,10.5,


In [45]:
# Variant 1: fill selected gaps instead of dropping rows. A missing "Selected"
# becomes "No" and missing comment sentiment scores become 0; the rank columns
# and all other columns are left as they are.
ideas_comments_pmNO_NaN0 = ideas_comments_pm.fillna(
    {"Selected": "No", "comment_subjectivity": 0, "comment_polarity": 0}
)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
ideas_comments_pmNO_NaN0.info()

# Variant 2: additionally drop every row that still contains a NaN in any column.
ideas_comments_pmNO_NaNdropped = ideas_comments_pmNO_NaN0.dropna()
ideas_comments_pmNO_NaNdropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   IdeaID                         100 non-null    object        
 1   Date                           100 non-null    datetime64[ns]
 2   Title                          100 non-null    object        
 3   Description                    100 non-null    object        
 4   NumberVotings                  100 non-null    float64       
 5   NumberComments                 100 non-null    int64         
 6   Age                            100 non-null    int64         
 7   NumVotings_PerDay              99 non-null     float64       
 8   Description_subjectivity       100 non-null    float64       
 9   Description_subjectivity_rank  100 non-null    float64       
 10  Description_polarity           100 non-null    float64       
 11  Description_polarity

In [46]:
# Compare ideas by "Selected" for both cleaned frames (NaN filled vs. rows dropped).
comment_metric_columns = [
    "comment_polarity",
    "comment_polarity_rank",
    "comment_subjectivity",
    "comment_subjectivity_rank",
    "NumberComments",
]
sentiment_columns = [
    "comment_polarity",
    "comment_subjectivity",
    "Description_polarity",
    "Description_subjectivity",
]

# Group each frame once and reuse the grouping for every statistic.
filled_by_selected = ideas_comments_pmNO_NaN0.groupby("Selected")
dropped_by_selected = ideas_comments_pmNO_NaNdropped.groupby("Selected")

# Group means of the comment metrics
print("ideas_comments_pmNO_NaNo mean")
print(filled_by_selected[comment_metric_columns].mean())
print("ideas_comments_pmNO_NaNdropped mean")
print(dropped_by_selected[comment_metric_columns].mean())

# Group medians of the comment metrics
print("ideas_comments_pmNO_NaNo median")
print(filled_by_selected[comment_metric_columns].median())
print("ideas_comments_pmNO_NaNdropped median")
print(dropped_by_selected[comment_metric_columns].median())

# Group means of comment vs. description sentiment (dropped-rows frame only)
print("ideas_comments_pmNO_NaNdropped mean")
print(dropped_by_selected[sentiment_columns].mean())


ideas_comments_pmNO_NaNo mean
          comment_polarity  comment_polarity_rank  comment_subjectivity  \
Selected                                                                  
No                0.160677               44.25974              0.393365   
Yes               0.160662               42.00000              0.499339   

          comment_subjectivity_rank  NumberComments  
Selected                                             
No                         43.25974             8.1  
Yes                        49.70000           165.7  
ideas_comments_pmNO_NaNdropped mean
          comment_polarity  comment_polarity_rank  comment_subjectivity  \
Selected                                                                  
No                0.187804               44.25974              0.459778   
Yes               0.160662               42.00000              0.499339   

          comment_subjectivity_rank  NumberComments  
Selected                                             
No      