In [None]:
import os
import warnings

import matplotlib.image as image
import matplotlib.offsetbox as offsetbox
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px # Imported for GANNT chart creation.
import seaborn as sns
from matplotlib.text import Text

warnings.filterwarnings('ignore')


# 1.1 Project Management

This project follows the CRISP-DM project management principles.
To demonstrate this, a Gantt chart is created using Plotly's Express library (px).

This initial chart will be compared to a completed one at the end of the project.

In [None]:
# Project plan for CA2: one row per task, given as (task, start date, finish date, % completed).
# NOTE(review): every task is already at 100% in this "initial" plan - confirm that is intended.
gantt_columns = ["Task", "Start", "Finish", "Percentage_Completed"]
gantt_rows = [
    ("GitHub Configuration", '2023-05-07', '2023-05-08', 100),
    ("Get twitter dataset", '2023-05-09', '2023-05-12', 100),
    ("Distriubuted Data Processing Enviornment", '2023-05-13', '2023-05-14', 100),
    ("Store data in SQL/NoSQL Database", '2023-05-15', '2023-05-16', 100),
    ("Follow up analysis on on the output data", '2023-05-17', '2023-05-18', 100),
    ("Testing Strategy - YCSB", '2023-05-19', '2023-05-20', 100),
    ("Sentiment Extraction", '2023-05-20', '2023-05-21', 100),
    # NOTE(review): Start and Finish are the same day here, so this bar presumably has
    # zero length on the timeline - confirm whether Finish should be a later date.
    ("Time Series forecasting", '2023-05-21', '2023-05-21', 100),
    ("Interactive Dashboard Setup", '2023-05-21', '2023-05-22', 100),
    ("Report", '2023-05-23', '2023-05-24', 100),
    ("Submission", '2023-05-25', '2023-05-26', 100),
]
CA2_Gantt_Chart = pd.DataFrame(gantt_rows, columns=gantt_columns)

# Timeline with one bar per task, spanning Start to Finish and coloured by completion percentage.
fig = px.timeline(
    CA2_Gantt_Chart,
    x_start="Start",
    x_end="Finish",
    y="Task",
    color="Percentage_Completed",
)
# The title is not centred by default; title_x=0.5 centres it.
fig.update_layout(title_text=" Initial CA2 GANNT Chart", title_x=0.5)
fig.show()

This is a placeholder for the final Gantt chart for the project. It will be completed on the project completion date.

In [None]:
# Placeholder cell for the final Gantt chart.
# The DataFrame and figure construction below are still commented out, so nothing in this
# cell rebinds `CA2_Gantt_Chart` or `fig`.
#CA2_Gantt_Chart = pd.DataFrame([
#    dict(Task="GitHub Configuration", Start='2023-05-07', Finish='2023-05-08', Percentage_Completed=100),
#    dict(Task="Get twitter dataset", Start='2023-05-09', Finish='2023-05-12', Percentage_Completed=100),
#    dict(Task="Distriubuted Data Processing Enviornment", Start='2023-05-13', Finish='2023-05-14',  Percentage_Completed=100),
#    dict(Task="Store data in SQL/NoSQL Database", Start='2023-05-15', Finish='2023-05-16',Percentage_Completed=100),
#    dict(Task="Follow up analysis on on the output data", Start='2023-05-17', Finish='2023-05-18', Percentage_Completed=100),
#    dict(Task="Testing Strategy - YCSB", Start='2023-05-19', Finish='2023-05-20', Percentage_Completed=100),
#    dict(Task="Sentiment Extraction", Start='2023-05-20', Finish='2023-05-21', Percentage_Completed=100),
#    dict(Task="Time Series forecasting", Start='2023-05-21', Finish='2023-05-21', Percentage_Completed=100),
#    dict(Task="Interactive Dashboard Setup", Start='2023-05-21', Finish='2023-05-22', Percentage_Completed=100),
#    dict(Task="Report", Start='2023-05-23', Finish='2023-05-24', Percentage_Completed=100),
#    dict(Task="Submission", Start='2023-05-25', Finish='2023-05-26',Percentage_Completed=100),
#])

#fig = px.timeline(CA2_Gantt_Chart, x_start="Start", x_end="Finish", y="Task", color="Percentage_Completed")
# Title was not centred so used the below to centre to title layout.
#fig.update_layout(title_text=" Initial CA2 GANNT Chart", title_x=0.5)
# `fig` is not reassigned in this cell, so this call displays whatever figure an earlier cell
# left in `fig` (the initial chart when the notebook is run top to bottom), and it fails
# with a NameError if no earlier cell has defined `fig`.
fig.show()

# 1.2 Data retrieval

## 1.2.1 Assignment links

In this section I am testing some possible datasets to be used for the project.

Both of the sources below require an API key, which I currently do not have.

https://datascienceparichay.com/article/get-data-from-twitter-api-in-python-step-by-step-guide/    

https://www.toptal.com/apache/apache-spark-streaming-twitter
    

This was attempted, but the server was too slow: almost 1.2 days per file download (1 month's data).
https://archive.org/details/twitterstream?sort=-publicdate


Due to slowness in the download from the server, I attempted to download a mon


## 1.2.2 Alternative Data sources

Another possible dataset was found on Kaggle.com : 
    
https://www.kaggle.com/datasets/prathamsharma123/farmers-protest-tweets-dataset-csv?resource=download
    
    
Now let's take a quick look at whether this data will be sufficient.

For the assignment, 1 year of tweets on a certain topic is required.

In [18]:
# Read the dataset downloaded from Kaggle into a DataFrame.
# The file location can be overridden with the TWEETS_CSV_PATH environment variable so the
# notebook can run on another machine without editing this cell. When the variable is not
# set, the original local path in the assignment folder is used, so behaviour is unchanged.
TWEETS_CSV_PATH = os.environ.get(
    'TWEETS_CSV_PATH',
    '/Users/adevane/Documents/AdvDA&BigData-FinalCA/MSC_DA_CA2v4/Datasets/archive/tweets.csv',
)
test_df = pd.read_csv(TWEETS_CSV_PATH)

In [19]:
# Preview the first 5 rows of the dataset.
test_df.head(n=5)

Unnamed: 0,tweetUrl,date,renderedContent,tweetId,userId,replyCount,retweetCount,likeCount,quoteCount,source,media,retweetedTweet,quotedTweet,mentionedUsers
0,https://twitter.com/ShashiRajbhar6/status/1376...,2021-03-30 03:33:46+00:00,Support 👇\n\n#FarmersProtest,1.376739e+18,1.01597e+18,0,0,0,0,"<a href=""http://twitter.com/download/android"" ...",,,,
1,https://twitter.com/kaursuk06272818/status/137...,2021-03-30 03:33:23+00:00,Supporting farmers means supporting our countr...,1.376739e+18,1.332937e+18,0,0,0,0,"<a href=""http://twitter.com/download/android"" ...",[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
2,https://twitter.com/kaursuk06272818/status/137...,2021-03-30 03:31:00+00:00,Support farmers if you are related to food #St...,1.376739e+18,1.332937e+18,0,0,0,0,"<a href=""http://twitter.com/download/android"" ...",[{'previewUrl': 'https://pbs.twimg.com/media/E...,,,
3,https://twitter.com/SukhdevSingh_/status/13767...,2021-03-30 03:30:45+00:00,#StopHateAgainstFarmers support #FarmersProtes...,1.376739e+18,1.308357e+18,0,1,3,0,"<a href=""http://twitter.com/download/android"" ...",,,,
4,https://twitter.com/Davidmu66668113/status/137...,2021-03-30 03:30:30+00:00,"You hate farmers I hate you, \nif you love the...",1.376739e+18,1.357312e+18,0,0,1,0,"<a href=""http://twitter.com/download/android"" ...",,,,


In [20]:
# Size of the dataset as a (rows, columns) tuple.
# Looks usable: 14 columns and over 1 million rows (tweets).
test_df.shape

(1084452, 14)

The assignment requires 1 full year of data, so let's create a simple check for this.

In [21]:
# Find the earliest and latest values of the 'date' column.
# NOTE(review): the column is read from the CSV without date parsing, so min()/max() compare
# the values as text. That matches chronological order only while every value uses the same
# 'YYYY-MM-DD HH:MM:SS+00:00' layout - confirm this holds for the whole file.
start_of_tweets = test_df['date'].min()
end_of_tweets = test_df['date'].max()

print('In this dataset we have tweets from: ', start_of_tweets,' until : ', end_of_tweets)

# Enforce the assignment requirement instead of relying on reading the printed dates:
# parse just the two boundary values and fail loudly if they span less than one year.
tweet_span = pd.to_datetime(end_of_tweets) - pd.to_datetime(start_of_tweets)
assert tweet_span >= pd.Timedelta(days=365), (
    f'Dataset covers only {tweet_span}; the assignment needs at least one full year of tweets'
)

In this dataset we have tweets from:  2020-11-01 03:36:57+00:00  until :  2021-11-21 05:36:30+00:00


This looks to be sufficient for our assignment criteria, as there is 1 year and 20+ days of data here.

For now, let's close this section; the next step is to decide which datasets to use.