In [2]:
import pandas as pd
import pandas_profiling
import json
import numpy
from pprint import pprint as p

fn = "tweets_01-08-2021.json"

with open(fn, encoding='utf-8') as f:
    df = pd.read_json(f)
    
df.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916600,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070300,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820600,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015600,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554898000,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f


In [19]:
# Initialize new columns to false
features = [
    'isLie', 
    'isOpposite', 
    'isPreemptive', 
    'isElection', 
    'isIKnowYouAreButWhatAmI', 
    'isRacist', 
    'isHitler', 
    'isPresident',
    'isRussia',
    'isUkraine',
    'isNuke',
    'isDeathStar',
    'isExecutivePrivilege',
    'isBusiness',
    'isPersonal',
    'isSmear',
    'isSexist',
    'isCelebrity',
    'isPentagon',
    'isNickname',
    'isXenophobic',
    'isMAGA',
    'isReligious',
    'isPandemic',
    'isAllCaps',
    'isRINO',
    'isChina',
    'isFirstImpeachment',
    'isSecondImpeachment',
    'isSoTrue',
    'isInTwoWeeks'
]

df[features] = 'f'


In [20]:
# Load labels from more_features.json

with open('more_features.json') as f:
    feature_dict = json.load(f)
                                  
for k, v in feature_dict.items():
    idx = df.loc[df['id'] == int(k)].index[0]
    for feature in v:
        df.loc[idx, feature] = 't'

label_df = df.loc[df['id'].isin(feature_dict.keys())]

print(df['isLie'].str.contains('t').sum())
print(df['isSmear'].str.contains('t').sum())
print((df['isLie'].str.contains('t') & df['isElection'].str.contains('t')).sum())

label_df.head()


175
98
146


Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged,isLie,...,isPandemic,isAllCaps,isRINO,isChina,isFirstImpeachment,isSecondImpeachment,isSoTrue,isInTwoWeeks,isNickame,isNickanme
3,1304875170860015600,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f,t,...,f,f,f,f,f,f,f,f,,
13,1225835449379258400,RT @TomFitton: Vindman's behavior is a scandal...,t,f,Twitter for iPhone,0,7679,2020-02-07 17:35:20,f,f,...,f,f,f,f,f,f,f,f,,
16,1319488376202879000,RT @marklevinshow: President Trump was outstan...,t,f,Twitter for iPhone,0,21259,2020-10-23 03:58:38,f,f,...,f,f,f,f,f,f,f,f,,
19,1325884977112883200,The threshold identification of Ballots is tur...,f,f,Twitter for iPhone,493076,100609,2020-11-09 19:36:26,f,t,...,f,f,f,f,f,f,f,f,,
20,1315779944002199600,"“I’m running as a proud Democrat, for the Sena...",f,f,Twitter for iPhone,142084,32953,2020-10-12 22:22:39,f,f,...,f,f,f,f,f,f,f,f,,


In [21]:
# Retain text, id, date, device
text_df = label_df.loc[:, ['id', 'text', 'date', 'device']]

# Drop columns not represented in the data, except twitter columns.
twitter_columns = [
    'isRetweet', 
    'isDeleted', 
    'favorites', 
    'retweets', 
    'isFlagged'
]

feature_set = set([item for val in feature_dict.values() for item in val])
valid_features = list((set(features).intersection(feature_set)) | set(twitter_columns))

# Drop invalid columns
drop_columns = set(label_df.columns).difference(set(valid_features))
label_df.drop(list(drop_columns), axis=1, inplace=True)
print(label_df.columns.values)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [26]:
# Convert T/F to binary

def map_fn(tf_str):
    if tf_str == 't':
        return 1
    else:
        if tf_str == 'f':
            return 0
        else:
            return tf_str
    
for col in label_df:
    label_df[col] = label_df[col].map(map_fn)
    
label_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df[col] = label_df[col].map(map_fn)


Unnamed: 0,isRetweet,isDeleted,favorites,retweets,isFlagged,isLie,isOpposite,isPreemptive,isElection,isIKnowYouAreButWhatAmI,...,isPresident,isRussia,isUkraine,isSmear,isCelebrity,isPentagon,isNickname,isXenophobic,isReligious,isPandemic
3,0,0,80527,23502,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
13,1,0,0,7679,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
16,1,0,0,21259,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
19,0,0,493076,100609,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
20,0,0,142084,32953,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [27]:
label_df.profile_report()


HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=36.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




