In [1]:
import pandas as pd
import os
from zipfile import ZipFile

<h2> Unzipping Month's Tweet Files </h2>

In [15]:
def unzipping(directory):
    """Extract every ``.zip`` archive found directly inside ``directory``.

    Each archive is extracted into ``directory`` itself. Entries whose name
    does not end with ``.zip`` (case-insensitive) are skipped, so the function
    can be re-run after a previous extraction without failing on the files
    that were already extracted.

    Parameters
    ----------
    directory : str
        Folder that holds the month's zipped tweet files.
    """
    for entry in os.listdir(directory):
        # Opening a non-archive with ZipFile raises BadZipFile, so only
        # entries that look like archives are processed.
        if not entry.lower().endswith(".zip"):
            continue
        archive_path = os.path.join(directory, entry)
        # The context manager closes the archive even if extraction fails.
        with ZipFile(archive_path, 'r') as archive:
            archive.extractall(directory)
    print("Done.")

In [None]:
# Extract the archives stored in the "July" directory, in place.
unzipping("July")

<h2> Delete Zip Files After Extraction </h2>

In [19]:
def delete_zip_files(directory):
    """Delete every entry of ``directory`` whose name ends with ``.zip``.

    Parameters
    ----------
    directory : str
        Folder to clean up once its archives have been extracted.
    """
    zip_names = [name for name in os.listdir(directory) if name.endswith(".zip")]
    for name in zip_names:
        os.remove(os.path.join(directory, name))
    print("Done.")

In [20]:
# Remove the .zip files from the "July" directory.
delete_zip_files("July")

Done.


<h2> Extract a Fraction from Each File, then Merge All Files into One Sample File </h2>

In [22]:
def sampling_merge(directory, frac=0.01, random_state=None):
    """Sample a fraction of every file in ``directory`` and merge the samples.

    Every regular file in ``directory`` is read as a header-less CSV, a random
    ``frac`` of its rows is drawn, and all samples are concatenated into
    ``<directory>/<directory name>_sample.csv`` (written without header or
    index).

    Parameters
    ----------
    directory : str
        Folder holding the month's CSV files; its last path component is used
        as the prefix of the output file name.
    frac : float, default 0.01
        Fraction of rows drawn from each file.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; pass an int for a reproducible
        sample. ``None`` keeps the draw unseeded.
    """
    # Use only the folder's own name for the file name so that a nested or
    # absolute ``directory`` still yields a valid output path.
    month_name = os.path.basename(os.path.normpath(directory))
    out_path = os.path.join(directory, month_name + "_sample.csv")

    samples = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories and the output of a previous run; otherwise a
        # re-run would sample its own earlier sample.
        if not os.path.isfile(path) or os.path.abspath(path) == os.path.abspath(out_path):
            continue
        df = pd.read_csv(path, header=None)
        samples.append(df.sample(frac=frac, random_state=random_state))

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and growing
    # a frame inside the loop copies all accumulated rows on every iteration.
    merged = pd.concat(samples) if samples else pd.DataFrame()
    merged.to_csv(out_path, index=False, header=False)
    print("Done.")

In [23]:
# Write July_sample.csv inside "July" from a sample of each file in that directory.
sampling_merge("July")

Done.


<h2> Before Hydration: Extract Tweet ids only from Tweet Files </h2>

In [25]:
def extract_tweet_id(dir, fileName):
    """Write the first column of a header-less CSV to a ``ready_`` file.

    Reads ``<dir>//<fileName>`` without a header row, keeps only column ``0``
    (presumably the tweet ids, going by the function name) and saves it,
    without index or header, as ``<dir>//ready_<fileName>``.

    Parameters
    ----------
    dir : str
        Folder containing the input file; the output is written there too.
    fileName : str
        Name of the CSV file to read.
    """
    source = f"{dir}//{fileName}"
    target = f"{dir}//ready_{fileName}"
    first_column = pd.read_csv(source, header=None).loc[:, 0]
    first_column.to_csv(target, index=False, header=None)
    print("Extraction Done. Ready to Hydrate")

In [26]:
# Write ready_July_sample.csv (first column of July_sample.csv only) inside "July".
extract_tweet_id("July", "July_sample.csv")


Extraction Done. Ready to Hydrate


<h2> Filter after Hydration Complete </h2>

In [6]:
def month_filtering(monthDirectory, monthHydratedFile, top_n=100000):
    """Filter, sort and slice a hydrated month file, saving each stage.

    Reads ``<monthDirectory>/<monthHydratedFile>.csv`` and writes three CSVs
    whose names are the input path with a suffix appended:

    * ``..._filtered.csv`` - rows with ``retweet_count > 0`` or
      ``favorite_count > 0``;
    * ``..._filtered_sorted.csv`` - the filtered rows ordered by
      ``retweet_count``, highest first;
    * ``..._filtered_sorted_sliced.csv`` - the first ``top_n`` sorted rows.

    Parameters
    ----------
    monthDirectory : str
        Folder containing the hydrated file.
    monthHydratedFile : str
        Name of the hydrated file without the ``.csv`` extension.
    top_n : int, default 100000
        Number of top rows kept in the sliced output.
    """
    path = os.path.join(monthDirectory, monthHydratedFile + ".csv")
    df_month = pd.read_csv(path)

    # Keep only tweets that received at least one retweet or favorite.
    has_engagement = (df_month['retweet_count'] > 0) | (df_month['favorite_count'] > 0)
    df_filtered = df_month[has_engagement]
    # Suffixes are appended to the full input path, so the output names keep
    # the ".csv" of the input in the middle; left unchanged so existing files
    # and downstream steps keep matching.
    df_filtered.to_csv(path+"_filtered.csv", index=False)
    print("Done filtering.")

    df_sorted = df_filtered.sort_values(by=['retweet_count'], ascending=False)
    df_sorted.to_csv(path+"_filtered_sorted.csv", index=False)
    print("Done sorting.")

    df_slice = df_sorted.iloc[:top_n]
    df_slice.to_csv(path+"_filtered_sorted_sliced.csv", index=False)
    print("Done slicing.")

In [7]:
# Filter, sort and slice ready_July_sample_hydrated.csv from the "July" directory.
month_filtering("July","ready_July_sample_hydrated")


Done filtering.
Done sorting.
Done slicing.


<h4> If the Month Sample Is Too Large: Take Subsets of It Before Hydration </h4>

In [7]:
# Split the month's file into consecutive, roughly quarter-sized subsets.
def month_subsetting_before_hydration(monthDirectory, monthFile, n_parts=4):
    """Split a month file into at most ``n_parts`` consecutive subsets.

    Reads ``<monthDirectory>/<monthFile>`` and writes the row blocks to
    ``<monthDirectory>/_subset_<i>.csv`` (``i`` starting at 1, no index
    column). Files with fewer rows than ``n_parts`` produce one subset per
    row; an empty file produces no subsets.

    Parameters
    ----------
    monthDirectory : str
        Folder containing the input file; subsets are written there too.
    monthFile : str
        Name of the CSV file to split.
    n_parts : int, default 4
        Maximum number of subsets to create.

    Raises
    ------
    ValueError
        If ``n_parts`` is smaller than 1.
    """
    if n_parts < 1:
        raise ValueError("n_parts must be at least 1")
    path = os.path.join(monthDirectory, monthFile)
    # NOTE(review): read_csv uses its default header handling here, so the
    # first line of the file becomes the column names. The sample files this
    # notebook writes with header=None have no header line - confirm that
    # treating their first row as a header is intended.
    df_month = pd.read_csv(path)
    total_rows = df_month.shape[0]
    # Ceiling division: flooring (int(rows / 4)) leaves a remainder that
    # spills into an extra fifth file, and yields a step of 0 (ValueError in
    # range) when there are fewer rows than parts.
    chunk_size = max(1, -(-total_rows // n_parts))
    for i, start in enumerate(range(0, total_rows, chunk_size), start=1):
        df_subset = df_month.iloc[start: start + chunk_size]
        out_path = os.path.join(monthDirectory, "_subset_" + str(i) + ".csv")
        df_subset.to_csv(out_path, index=False)
    print("Done.")

In [8]:
# Split June_sample.csv from the "June" directory into _subset_<i>.csv files.
month_subsetting_before_hydration("June","June_sample.csv")

Done.


<h4> If the Month File Is Too Large: Split and Filter It After Hydration </h4>

In [2]:
# Split the hydrated month file into consecutive subsets and filter each one.
def month_subsetting_after_hydration(monthDirectory, monthHydratedFile, n_parts=4):
    """Split a hydrated month file into subsets and filter each of them.

    Reads ``<monthDirectory>/<monthHydratedFile>``, cuts it into at most
    ``n_parts`` consecutive row blocks, keeps in each block only the rows
    with ``retweet_count > 0`` or ``favorite_count > 0``, and writes the
    result to ``<monthDirectory>/_subset_<i>.csv`` (``i`` starting at 1, no
    index column). A block is written even if no row survives the filter.

    Parameters
    ----------
    monthDirectory : str
        Folder containing the hydrated file; subsets are written there too.
    monthHydratedFile : str
        Name of the hydrated CSV file, including its extension.
    n_parts : int, default 4
        Maximum number of subsets to create.

    Raises
    ------
    ValueError
        If ``n_parts`` is smaller than 1.
    """
    if n_parts < 1:
        raise ValueError("n_parts must be at least 1")
    path = os.path.join(monthDirectory, monthHydratedFile)
    df_month = pd.read_csv(path)
    total_rows = df_month.shape[0]
    # Ceiling division: flooring (int(rows / 4)) leaves a remainder that
    # spills into an extra fifth file, and yields a step of 0 (ValueError in
    # range) when there are fewer rows than parts.
    chunk_size = max(1, -(-total_rows // n_parts))
    for i, start in enumerate(range(0, total_rows, chunk_size), start=1):
        df_subset = df_month.iloc[start: start + chunk_size]
        # Keep only tweets that received at least one retweet or favorite.
        has_engagement = (df_subset['retweet_count'] > 0) | (df_subset['favorite_count'] > 0)
        df_subset = df_subset[has_engagement]
        out_path = os.path.join(monthDirectory, "_subset_" + str(i) + ".csv")
        df_subset.to_csv(out_path, index=False)

In [7]:
# NOTE(review): in the visible cells df_month is only assigned inside
# functions, so this cell presumably relies on a variable left in the kernel
# by a cell that is no longer in the notebook - confirm, or load the frame
# explicitly before inspecting its columns.
df_month.columns

Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'retweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')