In [1]:
import pandas as pd
import numpy as np
import json

<b>IMPORTING THE FILE</b>

In [2]:
with open('tweets.json') as jfile:
    for line in jfile:
        data = json.loads(line)
    d = pd.DataFrame(data)
df = d.T
df.head()

Unnamed: 0,tweet_author,tweet_text
1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [3]:
# Resetting the index to get rid of keys

df = df.reset_index(drop=True)
df.head()

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


<b><h2>CLEANING THE DATA</h2></b>

In [4]:
import string
import re

In [5]:
# Special characters that are included in the punctuation list
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
# Defining a function that replaces unwanted text with nothing, removes special characters and turn text into lowercase

def clean_text(text):
    
    text = re.sub(r'https?:\/\/s+',' ',text)
    text = re.sub('[^a-zA-Z]',' ',text)
    text = "".join([c for c in text if c not in string.punctuation])
    text = text.lower()
    return text


In [7]:
# Applying the clean_text function to tweet_text

df['tweet_text'] = df['tweet_text'].apply(clean_text)
df.head()

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,scientists conducted a phase ii study of ac...
1,"Michael Wang, MD",this phase acalabrutinib venetoclax av tri...
2,1stOncology,nice backs astrazenecas calquence for cll ...
3,Toby Eyre,acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,nice has recommended the use of acalabrutinib ...


In [8]:
df.isnull().values.any()

False

<b><h1>OBJECTIVE 1:</h1>\
    Get the most frequent entities from the tweets</b>

In [9]:
# Importing nltk and downloading the punkt module for tokenizing

import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Prateek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Dowloading a list of commonly used words such as 'the', 'a', 'an', 'in' etc, which add no value to our analysis

nltk.download("stopwords")
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prateek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Downloading WordnetLemmatizer that groups together the different inflected forms of a word to analyze as a single item 

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Prateek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


\
\
<b>TOKENIZATION, LEMMATIZATION, STOPWORDS</b>

In [12]:
# Creating a list that contains tokenized and lemmatized text, and excludes words in the stopwords list.

compilation = []

for x in range(0, len(df)):
    words = word_tokenize(df['tweet_text'][x],'english')
    words = [wn.lemmatize(word) for word in words if word not in set(stoplist)]
    words = " ".join(words)
    compilation.append(words)
               

In [13]:
# Adding another column 'Compiled Text' to the dataframe df and dropping the 'tweet_text' column

df['Compiled Text'] = compilation
df.drop(columns = 'tweet_text', inplace = True)
df.head()

Unnamed: 0,tweet_author,Compiled Text
0,Hematopoiesis News,scientist conducted phase ii study acalabrutin...
1,"Michael Wang, MD",phase acalabrutinib venetoclax av trial still ...
2,1stOncology,nice back astrazenecas calquence cll http co v...
3,Toby Eyre,acalabrutinib valuable option pt intolerant ib...
4,Lymphoma Hub,nice recommended use acalabrutinib patient tre...


In [14]:
# Renaming the column 'tweet_author' to Author

df.rename(columns = {'tweet_author':'Author'}, inplace = True)
df.head()

Unnamed: 0,Author,Compiled Text
0,Hematopoiesis News,scientist conducted phase ii study acalabrutin...
1,"Michael Wang, MD",phase acalabrutinib venetoclax av trial still ...
2,1stOncology,nice back astrazenecas calquence cll http co v...
3,Toby Eyre,acalabrutinib valuable option pt intolerant ib...
4,Lymphoma Hub,nice recommended use acalabrutinib patient tre...


In [15]:
# The .join() method takes all items in an iterable and joins them into one string

chain_text = "".join(df['Compiled Text'])

In [16]:
# The .split() method splits a string into a list

split_text = chain_text.split()

In [17]:
# Counter is a container included in the collections module

from collections import Counter

In [18]:
# Counting the split text and getting those words with the highest repetition

freq = Counter(split_text)
rep_words = freq.most_common()
print(rep_words)



\
<b>Extracting the words as csv for Objective 1:</b>

In [65]:
obj_1 = pd.DataFrame(rep_words)

In [66]:
obj_1.to_csv('Objective_1.csv')

<b><h1>OBJECTIVE 2:</h1>\
    Find out the sentiment/polarity of each author towards each of the entities</b>

In [19]:
# Importing the file

with open('tweets.json') as jfile:
    for line in jfile:
        data2 = json.loads(line)
    d2 = pd.DataFrame(data2)
df2 = d2.T
df2.head()

Unnamed: 0,tweet_author,tweet_text
1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [20]:
# Resetting the index to get rid of the keys

df2 = df2.reset_index(drop=True)
df2.head()

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [21]:
# Defining a function that replaces unwanted text with nothing, removes special characters and turn text into lowercase

def clean_text_2(text):
    
    text = re.sub(r'https?:\/\/s+',' ',text)
    text = re.sub('[^a-zA-Z]',' ',text)
    text = "".join([c for c in text if c not in string.punctuation])
    text = text.lower()
    return text


In [22]:
# Applying the clean_text function to tweet_text 

df2['tweet_text'] = df2['tweet_text'].apply(clean_text_2)
df2.head()

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,scientists conducted a phase ii study of ac...
1,"Michael Wang, MD",this phase acalabrutinib venetoclax av tri...
2,1stOncology,nice backs astrazenecas calquence for cll ...
3,Toby Eyre,acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,nice has recommended the use of acalabrutinib ...


In [23]:
# Importing Textblob for classification

from textblob import TextBlob

In [24]:
# Defining a function to use Textblob to get subjectivity of the authors with respect to their tweet

def author_sentiment(text):
    view = TextBlob(text).sentiment.subjectivity
    return view

In [25]:
# Applying the 'author_sentiment' function to tweet_text to get 'author_Subjectivity' column

df2['author_Subjectivity'] = df2['tweet_text'].apply(author_sentiment)
df2.head()

Unnamed: 0,tweet_author,tweet_text,author_Subjectivity
0,Hematopoiesis News,scientists conducted a phase ii study of ac...,0.0
1,"Michael Wang, MD",this phase acalabrutinib venetoclax av tri...,0.0
2,1stOncology,nice backs astrazenecas calquence for cll ...,1.0
3,Toby Eyre,acalabrutinib is a valuable option in pts int...,0.4
4,Lymphoma Hub,nice has recommended the use of acalabrutinib ...,0.75


In [26]:
# Defining a function to use Textblob to get a polarity score of the authors with respect to their tweet

def author_polarity(text):
    return TextBlob(text).sentiment.polarity

In [27]:
# Applying the 'author_polarity' function to tweet_text to get 'author_Polarity' column

df2['author_Polarity'] = df2['tweet_text'].apply(author_polarity)
df2.head()

Unnamed: 0,tweet_author,tweet_text,author_Subjectivity,author_Polarity
0,Hematopoiesis News,scientists conducted a phase ii study of ac...,0.0,0.0
1,"Michael Wang, MD",this phase acalabrutinib venetoclax av tri...,0.0,0.0
2,1stOncology,nice backs astrazenecas calquence for cll ...,1.0,0.6
3,Toby Eyre,acalabrutinib is a valuable option in pts int...,0.4,0.05
4,Lymphoma Hub,nice has recommended the use of acalabrutinib ...,0.75,0.55


\
\
<b>POSITIVE / NEGATIVE TWEETS</b>

In [28]:
# Defining a function to classify a tweet as 'Negative', 'Neutral', or 'Positive' based on the polarity score

def Pos_or_Neg(polarity):
    
    if polarity < 0:
        return "Negative"
    
    elif polarity == 0:
        return "Neutral"
    
    else:
        return "Positive"

In [29]:
# Adding a new column that describes the sentiment of the tweet using the 'Pos_or_Neg' function

df2['+/-'] = df2['author_Polarity'].apply(Pos_or_Neg)

In [30]:
df2.head(10)

Unnamed: 0,tweet_author,tweet_text,author_Subjectivity,author_Polarity,+/-
0,Hematopoiesis News,scientists conducted a phase ii study of ac...,0.0,0.0,Neutral
1,"Michael Wang, MD",this phase acalabrutinib venetoclax av tri...,0.0,0.0,Neutral
2,1stOncology,nice backs astrazenecas calquence for cll ...,1.0,0.6,Positive
3,Toby Eyre,acalabrutinib is a valuable option in pts int...,0.4,0.05,Positive
4,Lymphoma Hub,nice has recommended the use of acalabrutinib ...,0.75,0.55,Positive
5,David Ledger,nice backs astrazeneca s calquence for cll htt...,1.0,0.6,Positive
6,N Wales Cancer Forum,this is england for now these decisions usua...,0.375,0.125,Positive
7,European Pharmaceutical Review,astrazeneca s calquence acalabrutinib a che...,0.65,0.45,Positive
8,Graham Collins,superstar tobyeyre responding to the excell...,1.0,0.8,Positive
9,CLL Ireland,cll patients all know the drug ibrutinib and y...,0.268182,0.045455,Positive


In [32]:
# Getting the count of Neutral, Positive and Negative tweets

df2['+/-'].value_counts()

Neutral     20389
Positive    19093
Negative     3865
Name: +/-, dtype: int64

\
<b>Extracting the sentiments as csv for Objective 2:</b>

In [67]:
df2.rename(columns = {'tweet_author':'Author'}, inplace = True)

In [69]:
obj_2 = df2[['tweet_text', 'Author', '+/-']]

In [70]:
obj_2.to_csv('Objective_2.csv')

\
\
<b><h2>Further Analysis</h2></b>

In [33]:
with open('tweets.json') as jfile:
    for line in jfile:
        data = json.loads(line)
    d = pd.DataFrame(data)
df3 = d.T
df3.head()

Unnamed: 0,tweet_author,tweet_text
1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [34]:
df3 = df3.reset_index(drop=True)
df3.head()

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [65]:
# Finding the number of distinct authors

dist_authors = len(pd.unique(df3['tweet_author']))
dist_authors

9292

In [67]:
df3['# Characters'] = df3['tweet_text'].str.len()
df3.head()

Unnamed: 0,tweet_author,tweet_text,# Characters
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,222
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,241
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,69
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,175
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,169


In [68]:
# Defining a function to calculate the avg word length of a tweet

def avg_word_length(tweet):
    
    words = tweet.split()
    
    return sum(len(word) for word in words)/len(words)

In [69]:
df3['Avg word length'] = df3['tweet_text'].apply(lambda x: avg_word_length(x))

In [71]:
df3.head()

Unnamed: 0,tweet_author,tweet_text,# Characters,Avg word length
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,222,7.222222
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,241,6.30303
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,69,9.0
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,175,6.25
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,169,7.095238
