In [2]:
from __future__ import division, print_function, absolute_import, unicode_literals

import os
from decimal import Decimal
from traceback import print_exc

import pandas as pd

DATA_PATH = os.path.abspath(os.path.join('..', '..', '..', 'Data'))
import gzip
import matplotlib
%matplotlib inline
from IPython.display import display, HTML 
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 200)

In [3]:
# this should load 200k tweets in about a minute
print('Loading tweets (could take a minute or so)...')
df = pd.read_csv(os.path.join(DATA_PATH, 'all_tweets.csv'), index_col='id', low_memory=True,
                 quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)
# in iPython Notebook print out df.columns to show that many of them contain dots
# rename the columns to be attribute-name friendly
df.columns = [label.replace('.', '_') for label in df.columns]
print('Done.')


Loading tweets (could take a minute or so)...


  interactivity=interactivity, compiler=compiler, result=result)


Done.


In [4]:
print('The raw table shape is {}'.format(df.shape))
nonnull_rows = 330
nonnull_cols = 50
df = df.dropna(axis=1, thresh=nonnull_rows)
print('After dropping columns with fewer than {} nonnull values, the table shape is {}'.format(nonnull_rows, df.shape))
df = df.dropna(axis=0, thresh=nonnull_cols)
print('After dropping rows with fewer than {} nonnull values, the table shape is {}'.format(nonnull_cols, df.shape))


# in ipython notebook, explore and describe the DataFrame columns
print('Of the {} columns, {} are actually DataFrames'.format(len(df.columns), sum([not isinstance(df[col], pd.Series) for col in df.columns])))
# remove dataframes with only 2 columns and one is the _str of the other:
for col in df.columns:
    if isinstance(df[col], pd.DataFrame):
        print('Column {} is a {}-wide DataFrame'.format(col, len(df[col].columns)))
        if df[col].columns[1] == df[col].columns[0] + '_str':
            print('Column {} looks easy because it has sub-columns {}'.format(col, df[col].columns))
            df[col] = df[col][df[col].columns[1]]
        else:
            try:
                assert(float(df[col].iloc[:, 0].max()) == float(df[col].iloc[:, 1].max()))
                df[col] = df[col].fillna(-1, inplace=False)
                series = pd.Series([int(Decimal(x)) for x in df[col].iloc[:, 1].values]).astype('int64').copy()
                del df[col]
                df[col] = series
                print('Finished converting column {} to type {}({})'.format(col, type(df[col]), df[col].dtype))
            except:
                print_exc()

print('Of the {} columns, {} are still DataFrames after trying to convert both columns to long integers'.format(
    len(df.columns), sum([not isinstance(df[col], pd.Series) for col in df.columns])))

The raw table shape is (200168, 285)
After dropping columns with fewer than 330 nonnull values, the table shape is (200168, 285)
After dropping rows with fewer than 50 nonnull values, the table shape is (193378, 285)
Of the 285 columns, 8 are actually DataFrames
Column quoted_status_id is a 2-wide DataFrame
Finished converting column quoted_status_id to type <class 'pandas.core.series.Series'>(float64)
Column quoted_status_id_str is a 2-wide DataFrame
Finished converting column quoted_status_id_str to type <class 'pandas.core.series.Series'>(float64)
Column retweeted_status_quoted_status_id is a 2-wide DataFrame
Finished converting column retweeted_status_quoted_status_id to type <class 'pandas.core.series.Series'>(float64)
Column retweeted_status_quoted_status_id_str is a 2-wide DataFrame
Finished converting column retweeted_status_quoted_status_id_str to type <class 'pandas.core.series.Series'>(float64)
Of the 281 columns, 0 are still DataFrames after trying to convert both columns t

In [5]:
print('df.describe() stats:')
desc = df.describe()
for col, stats in desc.T.iterrows():
    print('')
    print('{} ({})'.format(col, df[col].dtype if isinstance(df[col], pd.Series) else type(df[col])))
    print(stats)

df.describe() stats:





favorite_count (int64)
count    193378.000000
mean          0.629679
std           6.251319
             ...      
50%           0.000000
75%           0.000000
max        1165.000000
Name: favorite_count, dtype: float64

id_str (int64)
count    1.933780e+05
mean     7.274888e+17
std      3.778481e+15
             ...     
50%      7.271970e+17
75%      7.301318e+17
max      7.345639e+17
Name: id_str, dtype: float64

in_reply_to_status_id (float64)
count    1.116500e+04
mean     7.270313e+17
std      1.485877e+16
             ...     
50%               NaN
75%               NaN
max      7.345608e+17
Name: in_reply_to_status_id, dtype: float64

in_reply_to_status_id_str (float64)
count    1.116500e+04
mean     7.270313e+17
std      1.485877e+16
             ...     
50%               NaN
75%               NaN
max      7.345608e+17
Name: in_reply_to_status_id_str, dtype: float64

in_reply_to_user_id (float64)
count    1.300700e+04
mean     2.245103e+16
std      1.249313e+17
            

In [6]:
df

Unnamed: 0_level_0,coordinates_coordinates,coordinates_type,created_at,entities_hashtags,entities_media,entities_symbols,entities_urls,entities_user_mentions,favorite_count,favorited,geo_coordinates,geo_type,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,is_quote_status,lang,lat,lon,metadata_iso_language_code,metadata_result_type,place_bounding_box_coordinates,place_bounding_box_type,place_contained_within,place_country,place_country_code,place_full_name,place_id,place_name,place_place_type,place_url,possibly_sensitive,quoted_status_created_at,quoted_status_entities_hashtags,quoted_status_entities_media,quoted_status_entities_symbols,quoted_status_entities_urls,quoted_status_entities_user_mentions,quoted_status_favorite_count,quoted_status_favorited,quoted_status_is_quote_status,quoted_status_lang,quoted_status_metadata_iso_language_code,quoted_status_metadata_result_type,quoted_status_possibly_sensitive,quoted_status_retweet_count,quoted_status_retweeted,quoted_status_source,quoted_status_text,quoted_status_truncated,quoted_status_user_contributors_enabled,quoted_status_user_created_at,quoted_status_user_default_profile,quoted_status_user_default_profile_image,quoted_status_user_description,quoted_status_user_entities_description_urls,quoted_status_user_entities_url_urls,quoted_status_user_favourites_count,quoted_status_user_followers_count,quoted_status_user_friends_count,quoted_status_user_geo_enabled,quoted_status_user_has_extended_profile,quoted_status_user_id,quoted_status_user_id_str,quoted_status_user_is_translation_enabled,quoted_status_user_is_translator,quoted_status_user_lang,quoted_status_user_listed_count,quoted_status_user_location,quoted_status_user_name,quoted_status_user_profile_background_color,quoted_status_user_profile_background_image_url,quoted_status_user_profile_background_image_url_https,quoted_status_user_profile_background_tile,quoted_status_user_profile_banner_url,quoted_status_user_profile_image_url,quoted_status_user_profile_image_url_https,quoted_status_user_profile_link_color,quoted_status_user_profile_sidebar_border_color,quoted_status_user_profile_sidebar_fill_color,quoted_status_user_profile_text_color,quoted_status_user_profile_use_background_image,quoted_status_user_protected,quoted_status_user_screen_name,quoted_status_user_statuses_count,quoted_status_user_time_zone,quoted_status_user_url,quoted_status_user_utc_offset,quoted_status_user_verified,retweet_count,retweeted,retweeted_status_created_at,retweeted_status_entities_hashtags,retweeted_status_entities_media,retweeted_status_entities_symbols,retweeted_status_entities_urls,retweeted_status_entities_user_mentions,...,retweeted_status_quoted_status_user_profile_sidebar_fill_color,retweeted_status_quoted_status_user_profile_text_color,retweeted_status_quoted_status_user_profile_use_background_image,retweeted_status_quoted_status_user_protected,retweeted_status_quoted_status_user_screen_name,retweeted_status_quoted_status_user_statuses_count,retweeted_status_quoted_status_user_time_zone,retweeted_status_quoted_status_user_url,retweeted_status_quoted_status_user_utc_offset,retweeted_status_quoted_status_user_verified,retweeted_status_retweet_count,retweeted_status_retweeted,retweeted_status_source,retweeted_status_text,retweeted_status_truncated,retweeted_status_user_contributors_enabled,retweeted_status_user_created_at,retweeted_status_user_default_profile,retweeted_status_user_default_profile_image,retweeted_status_user_description,retweeted_status_user_entities_description_urls,retweeted_status_user_entities_url_urls,retweeted_status_user_favourites_count,retweeted_status_user_followers_count,retweeted_status_user_friends_count,retweeted_status_user_geo_enabled,retweeted_status_user_has_extended_profile,retweeted_status_user_id,retweeted_status_user_id_str,retweeted_status_user_is_translation_enabled,retweeted_status_user_is_translator,retweeted_status_user_lang,retweeted_status_user_listed_count,retweeted_status_user_location,retweeted_status_user_name,retweeted_status_user_profile_background_color,retweeted_status_user_profile_background_image_url,retweeted_status_user_profile_background_image_url_https,retweeted_status_user_profile_background_tile,retweeted_status_user_profile_banner_url,retweeted_status_user_profile_image_url,retweeted_status_user_profile_image_url_https,retweeted_status_user_profile_link_color,retweeted_status_user_profile_sidebar_border_color,retweeted_status_user_profile_sidebar_fill_color,retweeted_status_user_profile_text_color,retweeted_status_user_profile_use_background_image,retweeted_status_user_protected,retweeted_status_user_screen_name,retweeted_status_user_statuses_count,retweeted_status_user_time_zone,retweeted_status_user_url,retweeted_status_user_utc_offset,retweeted_status_user_verified,source,text,truncated,user_contributors_enabled,user_created_at,user_default_profile,user_default_profile_image,user_description,user_entities_description_urls,user_entities_url_urls,user_favourites_count,user_followers_count,user_friends_count,user_geo_enabled,user_has_extended_profile,user_id,user_id_str,user_is_translation_enabled,user_is_translator,user_lang,user_listed_count,user_location,user_name,user_profile_background_color,user_profile_background_image_url,user_profile_background_image_url_https,user_profile_background_tile,user_profile_banner_url,user_profile_image_url,user_profile_image_url_https,user_profile_link_color,user_profile_sidebar_border_color,user_profile_sidebar_fill_color,user_profile_text_color,user_profile_use_background_image,user_protected,user_screen_name,user_statuses_count,user_time_zone,user_url,user_utc_offset,user_verified,quoted_status_id,quoted_status_id_str,retweeted_status_quoted_status_id,retweeted_status_quoted_status_id_str
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1
731122251278499841,,,Fri May 13 14:01:42 +0000 2016,"[{u'indices': [47, 52], u'text': u'Java'}, {u'...","[{u'source_user_id': 150820027, u'source_statu...",[],"[{u'url': u'https://t.co/SVgMAwNxxj', u'indice...","[{u'indices': [3, 17], u'id_str': u'150820027'...",0,False,,,731122251278499841,,,,,,False,en,,,en,recent,,,,,,,,,,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14,False,Fri May 13 13:30:47 +0000 2016,"[{u'indices': [28, 33], u'text': u'Java'}, {u'...",[{u'expanded_url': u'http://twitter.com/javaco...,[],"[{u'url': u'https://t.co/SVgMAwNxxj', u'indice...",[],...,,,,,,,,,,,14.0,False,"<a href=""http://bufferapp.com"" rel=""nofollow"">...","Top Performance Metrics for #Java, .NET, #PHP,...",False,False,Tue Jun 01 22:38:53 +0000 2010,False,False,Java developers resource center. JCGs is one o...,[],"[{u'url': u'http://t.co/DivczES801', u'indices...",0.0,90268.0,130.0,False,False,150820027.0,150820027.0,False,False,en,1717.0,,Java Code Geeks,ACDED6,http://abs.twimg.com/images/themes/theme18/bg.gif,https://abs.twimg.com/images/themes/theme18/bg...,False,https://pbs.twimg.com/profile_banners/15082002...,http://pbs.twimg.com/profile_images/2928906892...,https://pbs.twimg.com/profile_images/292890689...,038543,EEEEEE,F6F6F6,333333,True,False,javacodegeeks,37567.0,Athens,http://t.co/DivczES801,10800.0,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @javacodegeeks: Top Performance Metrics for...,False,False,Wed Aug 12 15:20:38 +0000 2009,False,False,"Husband, Father, Programmer, Gamer, Graphic De...",[],,845,221,709,False,False,65061698,65061698,False,False,en,8,,Greg Herhuth,000000,http://abs.twimg.com/images/themes/theme9/bg.gif,https://abs.twimg.com/images/themes/theme9/bg.gif,False,https://pbs.twimg.com/profile_banners/65061698...,http://pbs.twimg.com/profile_images/7228456300...,https://pbs.twimg.com/profile_images/722845630...,3B94D9,000000,000000,000000,False,False,zamajam,579,Eastern Time (US & Canada),,-14400.0,False,,,,
724281574129180672,,,Sun Apr 24 16:59:18 +0000 2016,[],,[],"[{u'url': u'https://t.co/HshSAeTMYc', u'indice...",[],0,False,,,724281574129180672,,,,,,False,en,,,en,recent,,,,,,,,,,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,False,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<a href=""http://twitterfeed.com"" rel=""nofollow...",World's Largest Python Discovered in Nepal: WA...,False,False,Tue Mar 24 14:13:53 +0000 2015,True,False,,[],"[{u'url': u'http://t.co/mkBfH8QmsX', u'indices...",0,776,1910,True,False,3110463964,3110463964,False,False,en,4,"Lokoja, Kogi State, Nigeria.",Ukpe Thompson,C0DEED,http://abs.twimg.com/images/themes/theme1/bg.png,https://abs.twimg.com/images/themes/theme1/bg.png,False,https://pbs.twimg.com/profile_banners/31104639...,http://pbs.twimg.com/profile_images/5852217706...,https://pbs.twimg.com/profile_images/585221770...,0084B4,C0DEED,DDEEF6,333333,True,False,newsymag,2159,Pacific Time (US & Canada),http://t.co/mkBfH8QmsX,-25200.0,False,,,,
724281535587856384,,,Sun Apr 24 16:59:09 +0000 2016,"[{u'indices': [109, 119], u'text': u'developer...",,[],"[{u'url': u'https://t.co/m2mVrc3RrQ', u'indice...","[{u'indices': [23, 34], u'screen_name': u'Expe...",0,False,,,724281535587856384,,,,,,False,en,,,en,recent,,,,,,,,,,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,False,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<a href=""http://bufferapp.com"" rel=""nofollow"">...",🎷 💎 STOP! Could you be @ExpendTeam's Python / ...,False,False,Sun Mar 10 22:59:25 +0000 2013,False,False,THE best place to find a job at a startup. Bro...,[],"[{u'url': u'http://t.co/1ZmWTfHGbb', u'indices...",105,2615,591,False,False,1258138550,1258138550,True,False,en,966,"London, UK",Unicorn Hunt,C0DEED,http://pbs.twimg.com/profile_background_images...,https://pbs.twimg.com/profile_background_image...,True,https://pbs.twimg.com/profile_banners/12581385...,http://pbs.twimg.com/profile_images/5611581146...,https://pbs.twimg.com/profile_images/561158114...,0084B4,FFFFFF,DDEEF6,333333,True,False,unicornhuntio,14193,London,http://t.co/1ZmWTfHGbb,3600.0,False,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724275609858392066,,,Sun Apr 24 16:35:36 +0000 2016,"[{u'indices': [77, 84], u'text': u'python'}, {...",,[],"[{u'indices': [53, 76], u'url': u'https://t.co...","[{u'indices': [3, 14], u'screen_name': u'RealP...",0,False,,,724275609858392066,,,,,,False,en,,,en,recent,,,,,,,,,,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6,False,Sun Apr 24 16:07:50 +0000 2016,"[{u'indices': [61, 68], u'text': u'python'}, {...",,[],"[{u'indices': [37, 60], u'url': u'https://t.co...",[],...,,,,,,,,,,,6.0,False,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",List of Python API Wrappers &gt;&gt; https://t...,False,False,Wed Aug 08 20:44:42 +0000 2012,True,False,Teaching practical programming through real-wo...,[],"[{u'indices': [0, 22], u'url': u'http://t.co/Z...",23339.0,5678.0,39.0,False,False,745911914.0,745911914.0,False,False,en,682.0,,RealPython.com,C0DEED,http://abs.twimg.com/images/themes/theme1/bg.png,https://abs.twimg.com/images/themes/theme1/bg.png,False,,http://pbs.twimg.com/profile_images/3312468366...,https://pbs.twimg.com/profile_images/331246836...,0084B4,C0DEED,DDEEF6,333333,True,False,RealPython,11640.0,Atlantic Time (Canada),http://t.co/ZhoOQdg9HF,-10800.0,False,"<a href=""http://digihub.wales"" rel=""nofollow"">...",RT @RealPython: List of Python API Wrappers &g...,False,False,Mon Aug 17 22:36:39 +0000 2015,True,False,#SocialMedia #SocialEnterprise Connecting Wal...,[],"[{u'indices': [0, 22], u'url': u'http://t.co/0...",161,867,2,False,False,3428977365,3428977365,False,False,en-gb,1514,"Colwyn Bay, Wales",DigiHub Wales,C0DEED,http://abs.twimg.com/images/themes/theme1/bg.png,https://abs.twimg.com/images/themes/theme1/bg.png,False,https://pbs.twimg.com/profile_banners/34289773...,http://pbs.twimg.com/profile_images/6368923955...,https://pbs.twimg.com/profile_images/636892395...,0084B4,C0DEED,DDEEF6,333333,True,False,DigiHubWales,10422,,http://t.co/0OZQFZaZ03,,False,,,,
724275578879111169,,,Sun Apr 24 16:35:29 +0000 2016,[],,[],"[{u'indices': [26, 49], u'url': u'https://t.co...",[],0,False,,,724275578879111169,,,,,,False,en,,,en,recent,,,,,,,,,,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,False,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<a href=""https://path.com/"" rel=""nofollow"">Pat...",Watching Boa vs. Python — https://t.co/5THbrirfQO,False,False,Wed Oct 05 01:11:53 +0000 2011,False,False,"| Vocal @MEMORIES_MTL | Di Doan Ibu, ku Dengar...",[],"[{u'indices': [0, 22], u'url': u'http://t.co/j...",105,819,275,True,False,385181009,385181009,False,False,id,1,PLBNG - MGL,ﺳﻮﺭﻳﺎ,020305,http://pbs.twimg.com/profile_background_images...,https://pbs.twimg.com/profile_background_image...,True,https://pbs.twimg.com/profile_banners/38518100...,http://pbs.twimg.com/profile_images/7056528218...,https://pbs.twimg.com/profile_images/705652821...,2FC2EF,000000,252429,666666,True,False,bismillah____,59510,Bangkok,http://t.co/jgsHtjOt6x,25200.0,False,,,,
724275568871673857,,,Sun Apr 24 16:35:26 +0000 2016,[],,[],"[{u'indices': [115, 138], u'url': u'https://t....",[],0,False,,,724275568871673857,,,,,,False,ru,,,ru,recent,,,,,,,,,,,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,False,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<a href=""http://vk.com"" rel=""nofollow"">vk.com ...",Чертова дюжина вакансий в IT и Digital / / 1....,False,False,Sun May 22 03:29:30 +0000 2011,False,False,,[],,1,61,15,False,False,302987528,302987528,False,False,ru,4,Rus,Alex Birgazov,C0DEED,http://pbs.twimg.com/profile_background_images...,https://pbs.twimg.com/profile_background_image...,False,https://pbs.twimg.com/profile_banners/30298752...,http://pbs.twimg.com/profile_images/1364034429...,https://pbs.twimg.com/profile_images/136403442...,0084B4,FFFFFF,DDEEF6,333333,True,False,weelman93,124,Irkutsk,,28800.0,False,,,,


In [7]:
import pandas as pd
np = pd.np
stats = df.describe()
columns = [c for c in stats.columns if stats[c]['count'] > 10000 or 'fav' in c or 'retweet' in c or df[c].dtype in (int, float, np.float64)] + ['text', 'favorite_count', 'geo_coordinates']
print(df.shape)
print(df[columns].shape)
for c in columns:
    print(c)
df.text



(193378, 281)
(193378, 60)
favorite_count
id_str
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
lat
lon
quoted_status_favorite_count
quoted_status_retweet_count
quoted_status_user_favourites_count
quoted_status_user_followers_count
quoted_status_user_friends_count
quoted_status_user_id
quoted_status_user_id_str
quoted_status_user_listed_count
quoted_status_user_statuses_count
quoted_status_user_utc_offset
retweet_count
retweeted_status_favorite_count
retweeted_status_id
retweeted_status_id_str
retweeted_status_in_reply_to_status_id
retweeted_status_in_reply_to_status_id_str
retweeted_status_in_reply_to_user_id
retweeted_status_in_reply_to_user_id_str
retweeted_status_quoted_status_favorite_count
retweeted_status_quoted_status_retweet_count
retweeted_status_quoted_status_user_favourites_count
retweeted_status_quoted_status_user_followers_count
retweeted_status_quoted_status_user_friends_count
retweeted_status_quoted_status_user_id
retweeted_s

id
731122251278499841    RT @javacodegeeks: Top Performance Metrics for...
724281574129180672    World's Largest Python Discovered in Nepal: WA...
724281535587856384    🎷 💎 STOP! Could you be @ExpendTeam's Python / ...
                                            ...                        
724275609858392066    RT @RealPython: List of Python API Wrappers &g...
724275578879111169    Watching Boa vs. Python — https://t.co/5THbrirfQO
724275568871673857    Чертова дюжина вакансий в IT и Digital /  / 1....
Name: text, dtype: object

In [None]:
df[columns].to_csv(os.path.join(DATA_PATH, 'cleaned_tweets.csv.gz'), compression='gzip', encoding='UTF-8', quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)

In [9]:
rawlen = len(df)
df.drop_duplicates('id_str', keep='last', inplace=True)
rawlen - len(df)

10308

In [10]:
df[columns].to_csv(os.path.join(DATA_PATH, 'deduped_tweets.csv.gz'), compression='gzip', encoding='UTF-8', quotechar='"', quoting=pd.io.common.csv.QUOTE_NONNUMERIC)