This notebook contains helper functions for combining tweet texts from multiple source files (CSV and whitespace-delimited text) into a single tab-separated CSV file or a pandas DataFrame.

In [1]:
import io
import csv
import pandas as pd
import numpy as np

In [2]:
# Path prefix that the loader functions prepend to each relative filename
# listed in `sources` before opening it.
root_dir = "../"

In [3]:
# Registry of tweet data sources.
# Each source entry holds:
#   "URL"       - where the dataset was obtained; copied into the "Source URL" column
#   "filenames" - input files for that source; the loaders prepend `root_dir` to each
# "GWU-TweetSets" is nested one level deeper: it maps a collection name to an
# entry of the same {"URL", "filenames"} shape.
sources = {
    "Cheng-Caverlee-Lee" : {
        "URL":"https://archive.org/details/twitter_cikm_2010",
        "filenames" : [
            "Tweets/Cheng-Caverlee-Lee/twitter_cikm_2010/all_tweets.txt"
        ]
    },
    "Russian-Troll-Tweets" : {
        "URL" : "https://github.com/fivethirtyeight/russian-troll-tweets/",
        "filenames" : [
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_1.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_2.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_3.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_4.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_5.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_6.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_7.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_8.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_9.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_10.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_11.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_12.csv",
            "Tweets/Russian-Troll-Tweets/russian-troll-tweets-master/IRAhandle_tweets_13.csv"
        ]
    },
    "Sentiment140" : {
        "URL" : "http://help.sentiment140.com/for-students/",
        "filenames" : [
            "Tweets/Sentiment140/trainingandtestdata/all_tweets.csv"
        ]
    },
    # Collections under this key are looked up by name in get_gwu_tweet_sets_df.
    "GWU-TweetSets" : {
        "115th-Congress-Reps" : {
            "URL" : "https://tweetsets.library.gwu.edu/dataset/6d30e16b",
            "filenames" : [
                "Tweets/GWU-TweetSets/115th-Congress-Reps/all_tweets.csv"
            ]
        },
        "115th-Congress-Senators" : {
            "URL" : "http://tweetsets.library.gwu.edu/dataset/f3a5b1b2",
            "filenames" : [
                "Tweets/GWU-TweetSets/115th-Congress-Senators/all_tweets.csv"
            ]
        },
        "Charlottesville" : {
            "URL" : "https://tweetsets.library.gwu.edu/dataset/2cfbd74b",
            "filenames" : [
                "Tweets/GWU-TweetSets/Charlottesville/all_tweets.csv"
            ]
        },
        "Climate-Change" : {
            "URL" : "http://tweetsets.library.gwu.edu/dataset/19627add",
            "filenames" : [
                "Tweets/GWU-TweetSets/Climate-Change/tweets.csv",
                "Tweets/GWU-TweetSets/Climate-Change/tweets1207.csv",
                "Tweets/GWU-TweetSets/Climate-Change/tweets2107.csv",
                "Tweets/GWU-TweetSets/Climate-Change/tweets3007.csv"
            ]
        },
        "Healthcare-Filter" : {
            "URL" : "http://tweetsets.library.gwu.edu/dataset/4d53eca6",
            "filenames" : [
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets.csv",
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets425.csv",
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets1325.csv",
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets2225.csv",
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets3125.csv",
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets4025.csv",
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets4925.csv",
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets5825.csv",
                "Tweets/GWU-TweetSets/Healthcare-Filter/tweets6725.csv"
            ]
        },
        "Healthcare-Search" : {
            "URL" : "http://tweetsets.library.gwu.edu/dataset/c3c06860",
            "filenames" : [
                "Tweets/GWU-TweetSets/Healthcare-Search/all_tweets.csv",
            ]
        },
        "US-Government" : {
            "URL" : "http://tweetsets.library.gwu.edu/dataset/839d1dd5",
            "filenames" : [
                "Tweets/GWU-TweetSets/US-Government/all_tweets.csv",
            ]
        }
    }
}

In [4]:
def get_cheng_caverlee_lee_df():
    """Load the Cheng-Caverlee-Lee tweets into a DataFrame.

    Every line of the first file listed for this source is stripped of
    non-ASCII characters and split on whitespace. The second token becomes the
    tweet ID; the tokens after it, excluding the final two, are re-joined with
    single spaces to form the tweet text. Lines with fewer than two tokens
    carry no tweet ID and are skipped.

    Returns:
        pandas.DataFrame indexed by "Tweet ID" with columns "Text",
        "Source Name" and "Source URL". Rows whose "Text" repeats an earlier
        row are dropped (the first occurrence is kept).
    """
    source_name = "Cheng-Caverlee-Lee"
    source_URL = sources[source_name]["URL"]
    filenames = sources[source_name]["filenames"]
    print(f"Extracting {source_name} data")

    rows = []
    with io.open( root_dir + filenames[0], mode="r", encoding="utf-8" ) as tweets_file:
        for line in tweets_file:
            tokens = line.encode('ascii', 'ignore').decode('ascii').split()
            # Explicit guard instead of a bare `except:`; the only expected
            # failure is a line too short to hold a tweet ID, and real errors
            # (or Ctrl-C) should not be silently swallowed.
            if len(tokens) < 2:
                continue
            # NOTE(review): presumably token 0 is the user ID and the last two
            # tokens are the timestamp -- confirm against the dataset README.
            rows.append( [tokens[1], " ".join( tokens[2:-2] )] )

    return (
        pd.DataFrame(rows, columns=["Tweet ID", "Text"])
        .set_index("Tweet ID")
        .drop_duplicates(subset="Text")
        .assign(**{"Source Name": source_name, "Source URL": source_URL})
    )

In [5]:
def get_russian_troll_tweets_df():
    """Load the Russian-Troll-Tweets CSV files into a single DataFrame.

    All files listed for this source are read and concatenated. The
    `tweet_id` column is renamed to "Tweet ID" and used as the index, the
    `content` column is renamed to "Text", and every other column is dropped.

    Returns:
        pandas.DataFrame indexed by "Tweet ID" with columns "Text",
        "Source Name" and "Source URL". Rows whose "Text" repeats an earlier
        row are dropped (the first occurrence is kept).
    """
    source_name = "Russian-Troll-Tweets"
    source_URL = sources[source_name]["URL"]
    filenames = sources[source_name]["filenames"]
    print(f"Extracting {source_name} data")

    wanted_columns = ["Tweet ID", "Text"]
    df = pd.concat( [ pd.read_csv(root_dir + filename) for filename in filenames ] )
    df = df.rename( columns={"tweet_id": "Tweet ID", "content": "Text"} )
    # Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
    # is rejected by pandas >= 2.0.
    df = df.drop( columns=df.columns.difference(wanted_columns) )
    return (
        df.set_index("Tweet ID")
        .drop_duplicates(subset="Text")
        .assign(**{"Source Name": source_name, "Source URL": source_URL})
    )

In [6]:
def get_sentiment140_df():
    """Load the Sentiment140 tweets into a DataFrame.

    The first file listed for this source is read as a header-less CSV and its
    last column is taken as the tweet text. No tweet IDs are extracted, so the
    "Tweet ID" index is None for every row.

    Returns:
        pandas.DataFrame indexed by "Tweet ID" with columns "Text",
        "Source Name" and "Source URL"; rows whose "Text" repeats an earlier
        row are dropped.
    """
    source_name = "Sentiment140"
    entry = sources[source_name]
    source_URL = entry["URL"]
    input_path = root_dir + entry["filenames"][0]
    print(f"Extracting {source_name} data")

    raw = pd.read_csv( input_path, header=None )
    return (
        pd.DataFrame({ "Tweet ID" : None, "Text" : raw.iloc[:, -1] })
        .set_index("Tweet ID")
        .drop_duplicates(subset="Text")
        .assign(**{"Source Name": source_name, "Source URL": source_URL})
    )

In [7]:
def get_gwu_tweet_sets_df(source_name):
    """Load one GWU TweetSets collection into a DataFrame.

    Reads and concatenates every file listed for `source_name` under
    sources["GWU-TweetSets"], renames the `id` column to "Tweet ID" and uses
    it as the index. Columns other than `id` are kept as they appear in the
    files.

    NOTE(review): the files are assumed to already contain a "Text" column;
    the de-duplication step raises KeyError otherwise -- confirm against the
    input files.

    Args:
        source_name: Key of the collection inside sources["GWU-TweetSets"].

    Returns:
        pandas.DataFrame indexed by "Tweet ID", with rows whose "Text" repeats
        an earlier row dropped and "Source Name" / "Source URL" columns set.
    """
    collection = sources["GWU-TweetSets"][source_name]
    source_URL = collection["URL"]
    filenames = collection["filenames"]
    print(f"Extracting {source_name} data")

    frames = []
    for filename in filenames:
        frames.append( pd.read_csv(root_dir + filename) )

    return (
        pd.concat(frames)
        .rename( columns={"id": "Tweet ID"} )
        .set_index("Tweet ID")
        .drop_duplicates(subset="Text")
        .assign(**{"Source Name": source_name, "Source URL": source_URL})
    )

In [8]:
def get_115th_congress_reps_df():
    """Return the DataFrame for the GWU TweetSets "115th-Congress-Reps" collection."""
    collection_name = "115th-Congress-Reps"
    return get_gwu_tweet_sets_df(source_name=collection_name)

In [9]:
def get_115th_congress_senators_df():
    """Return the DataFrame for the GWU TweetSets "115th-Congress-Senators" collection."""
    collection_name = "115th-Congress-Senators"
    return get_gwu_tweet_sets_df(source_name=collection_name)

In [10]:
def get_charlottesville_df():
    """Return the DataFrame for the GWU TweetSets "Charlottesville" collection."""
    collection_name = "Charlottesville"
    return get_gwu_tweet_sets_df(source_name=collection_name)

In [11]:
def get_climate_change_df():
    """Return the DataFrame for the GWU TweetSets "Climate-Change" collection."""
    collection_name = "Climate-Change"
    return get_gwu_tweet_sets_df(source_name=collection_name)

In [12]:
def get_healthcare_filter_df():
    """Return the DataFrame for the GWU TweetSets "Healthcare-Filter" collection."""
    collection_name = "Healthcare-Filter"
    return get_gwu_tweet_sets_df(source_name=collection_name)

In [13]:
def get_healthcare_search_df():
    """Return the DataFrame for the GWU TweetSets "Healthcare-Search" collection."""
    collection_name = "Healthcare-Search"
    return get_gwu_tweet_sets_df(source_name=collection_name)

In [14]:
def get_us_government_df():
    """Return the DataFrame for the GWU TweetSets "US-Government" collection."""
    collection_name = "US-Government"
    return get_gwu_tweet_sets_df(source_name=collection_name)

In [15]:
def get_all_tweets_from_source_files():
    """Load every configured tweet source and combine them into one DataFrame.

    Calls each per-source loader, concatenates the results, strips non-ASCII
    characters from the "Text" column and then drops rows whose "Text" repeats
    an earlier row (the first occurrence is kept).

    Returns:
        pandas.DataFrame holding the combined rows of all sources, with
        ASCII-only, de-duplicated "Text" values.
    """
    print("Extracting all tweets from source files...")
    loaders = (
        get_cheng_caverlee_lee_df,
        get_russian_troll_tweets_df,
        get_sentiment140_df,
        get_115th_congress_reps_df,
        get_115th_congress_senators_df,
        get_charlottesville_df,
        get_climate_change_df,
        get_healthcare_filter_df,
        get_healthcare_search_df,
        get_us_government_df,
    )
    df = pd.concat([ load() for load in loaders ])
    # Normalise to ASCII *before* de-duplicating: texts that differ only in
    # characters removed here would otherwise both survive and show up as
    # identical rows in the result.
    df["Text"] = df["Text"].str.encode('ascii', 'ignore').str.decode('ascii')
    df.drop_duplicates(subset="Text", inplace=True)
    print("Done.")
    return df

In [16]:
def write_all_tweets_to_csv_file(tweets_df=None, filename="../Tweets/all_tweets_aggregated.csv"):
    """Write a tweets DataFrame to a tab-separated file with every field quoted.

    Args:
        tweets_df: DataFrame to export. When None, the combined DataFrame is
            built by calling get_all_tweets_from_source_files().
        filename: Destination path; an existing file is overwritten.
    """
    if tweets_df is None:
        tweets_df = get_all_tweets_from_source_files()
    print("Exporting DataFrame to CSV file...")
    # newline="" lets the CSV writer control line endings (otherwise Windows
    # text mode turns its "\r\n" into "\r\r\n"); the explicit encoding keeps
    # the output independent of the platform default.
    with open(filename, "w", encoding="utf-8", newline="") as new_file:
        tweets_df.to_csv(new_file, sep="\t", quoting=csv.QUOTE_ALL, quotechar='"')
    print("Done.")

In [None]:
def write_all_shuffled_tweets_to_csv_file(tweets_df=None, filename="../Tweets/all_tweets_shuffled.csv"):
    """Write the rows of a tweets DataFrame, in random order, to a tab-separated file.

    The rows are shuffled by position using NumPy's global random state, the
    incoming index is replaced by a fresh 0..n-1 index, all columns are
    converted to strings, and every field is quoted on output.

    NOTE(review): the incoming index (the tweet IDs, when the frame comes from
    get_all_tweets_from_source_files) is not written to the file. This matches
    the previous output; call `reset_index()` on the frame before passing it
    in if the IDs should be kept as a column.

    Args:
        tweets_df: DataFrame to export. When None, the combined DataFrame is
            built by calling get_all_tweets_from_source_files().
        filename: Destination path; an existing file is overwritten.
    """
    if tweets_df is None:
        tweets_df = get_all_tweets_from_source_files()
    # Shuffle by row position, not by index label: the index can hold repeated
    # or missing tweet IDs, and a label-based lookup would return every
    # matching row for each repeated label, multiplying rows in the output.
    shuffled_positions = np.random.permutation(len(tweets_df))
    shuffled_df = tweets_df.iloc[shuffled_positions].reset_index(drop=True).astype(str)
    print("Exporting shuffled DataFrame to CSV file...")
    # newline="" lets the CSV writer control line endings; the explicit
    # encoding keeps the output independent of the platform default.
    with open(filename, "w", encoding="utf-8", newline="") as new_file:
        shuffled_df.to_csv(new_file, sep="\t", quoting=csv.QUOTE_ALL, quotechar='"')
    print("Done.")