In [4]:
import os
import json
import sys 

import numpy as np
import pandas as pd

In [5]:
### TRUE if you are running a TEST scenario. If yes then read only one file
### Default if FALSE and read all the files

TEST = False

In [6]:
filenames = list()
dutch_tweets = pd.DataFrame()
for file in os.listdir('/home/ubuntu'):
    if "dutch_tweets_chunk" in file:
            filenames.append(file)
            print(f"Reading file...{file}")
            temp = pd.read_json(file)
            print(f"Shape of file read...{temp.shape}")
            dutch_tweets = pd.concat([dutch_tweets, temp])
            if TEST:
                break

Reading file...dutch_tweets_chunk1.json
Shape of file read...(27142, 23)
Reading file...dutch_tweets_chunk2.json
Shape of file read...(27130, 23)
Reading file...dutch_tweets_chunk5.json
Shape of file read...(27104, 23)
Reading file...dutch_tweets_chunk7.json
Shape of file read...(27234, 23)
Reading file...dutch_tweets_chunk9.json
Shape of file read...(27221, 23)
Reading file...dutch_tweets_chunk6.json
Shape of file read...(27026, 23)
Reading file...dutch_tweets_chunk3.json
Shape of file read...(27112, 23)
Reading file...dutch_tweets_chunk4.json
Shape of file read...(27217, 23)
Reading file...dutch_tweets_chunk0.json
Shape of file read...(27019, 23)
Reading file...dutch_tweets_chunk8.json
Shape of file read...(27137, 23)


In [7]:
dutch_tweets.columns

Index(['full_text', 'text_translation', 'created_at', 'screen_name',
       'description', 'desc_translation', 'weekofyear', 'weekday', 'day',
       'month', 'year', 'location', 'point_info', 'point', 'latitude',
       'longitude', 'altitude', 'province', 'hisco_standard', 'hisco_code',
       'industry', 'sentiment_pattern', 'subjective_pattern'],
      dtype='object')

In [8]:
# Create label column..using the following rule
# Converting the sentiment scores to pos, neg & neu
# pos > +0.1
# neg < -0.1
# -0.1 <=neu <= +0.1 

In [9]:
def label(sentiment_score):
    if sentiment_score > 0.1:
        return 'pos'
    elif sentiment_score < -0.1:
        return 'neg'
    else:
        return 'neu'

In [10]:
%%time
dutch_tweets['label'] = dutch_tweets['sentiment_pattern'].apply(lambda x: label(x))

CPU times: user 77.1 ms, sys: 0 ns, total: 77.1 ms
Wall time: 77.5 ms


In [11]:
dutch_tweets['label'].value_counts()

neu    157734
pos     71260
neg     42348
Name: label, dtype: int64

In [12]:
dutch_tweets.drop(['created_at'], inplace=True, axis=1)

In [13]:
dutch_tweets.columns

Index(['full_text', 'text_translation', 'screen_name', 'description',
       'desc_translation', 'weekofyear', 'weekday', 'day', 'month', 'year',
       'location', 'point_info', 'point', 'latitude', 'longitude', 'altitude',
       'province', 'hisco_standard', 'hisco_code', 'industry',
       'sentiment_pattern', 'subjective_pattern', 'label'],
      dtype='object')

In [14]:
dutch_tweets.dtypes

full_text              object
text_translation       object
screen_name            object
description            object
desc_translation       object
weekofyear            float64
weekday               float64
day                   float64
month                 float64
year                  float64
location               object
point_info             object
point                  object
latitude              float64
longitude             float64
altitude              float64
province               object
hisco_standard         object
hisco_code             object
industry                 bool
sentiment_pattern     float64
subjective_pattern    float64
label                  object
dtype: object

In [15]:
obj_columns = list(dutch_tweets.loc[:, dutch_tweets.dtypes == object].columns)
obj_columns

['full_text',
 'text_translation',
 'screen_name',
 'description',
 'desc_translation',
 'location',
 'point_info',
 'point',
 'province',
 'hisco_standard',
 'hisco_code',
 'label']

In [16]:
dutch_tweets[obj_columns] = dutch_tweets[obj_columns].astype('str')

In [17]:
dutch_tweets.isnull().sum()

full_text                  0
text_translation           0
screen_name                0
description                0
desc_translation           0
weekofyear                14
weekday                   20
day                       14
month                     20
year                      20
location                   0
point_info                 0
point                      0
latitude              136897
longitude             136897
altitude               16917
province                   0
hisco_standard             0
hisco_code                 0
industry                   0
sentiment_pattern          0
subjective_pattern         0
label                      0
dtype: int64

In [18]:
dutch_tweets[["latitude", "longitude", "altitude", "month", "year", "weekofyear", "weekday", "day"]] = dutch_tweets[["latitude", "longitude", "altitude", "month", "year","weekofyear", "weekday", "day"]].fillna(0.0)

In [19]:
dutch_tweets[["full_text", "text_translation", "screen_name", "description", "desc_translation", "location", "point_info", "point", "province", "hisco_standard", "hisco_code"]] =  dutch_tweets[["full_text", "text_translation", "screen_name", "description", "desc_translation", "location", "point_info", "point", "province", "hisco_standard", "hisco_code"]].fillna(" ")

In [20]:
dutch_tweets[["latitude", "longitude", "altitude", "weekofyear", "weekday", "day"]].isnull().sum()

latitude      0
longitude     0
altitude      0
weekofyear    0
weekday       0
day           0
dtype: int64

In [21]:
dutch_tweets.isnull().sum()

full_text             0
text_translation      0
screen_name           0
description           0
desc_translation      0
weekofyear            0
weekday               0
day                   0
month                 0
year                  0
location              0
point_info            0
point                 0
latitude              0
longitude             0
altitude              0
province              0
hisco_standard        0
hisco_code            0
industry              0
sentiment_pattern     0
subjective_pattern    0
label                 0
dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, **options)
    Split arrays or matrices into random train and test subsets
    
    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data
    into a single call for splitting (and optionally subsampling) data in a
    oneliner.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    
    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_s

In [24]:
train_x, test_x, train_y, test_y = train_test_split(dutch_tweets.drop(["label"], axis=1), dutch_tweets['label'], test_size=0.4, random_state=42 )

In [25]:
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)

(162805, 22) (162805,)
(108537, 22) (108537,)


In [26]:
test_x, val_x, test_y, val_y,  = train_test_split(test_x, test_y, test_size=0.5, random_state=42 )

In [27]:
print(val_x.shape, val_y.shape)
print(test_x.shape, test_y.shape)

(54269, 22) (54269,)
(54268, 22) (54268,)


In [28]:
train_df = pd.concat([train_x, train_y], axis=1)
train_df.shape

(162805, 23)

In [29]:
test_df = pd.concat([test_x, test_y], axis=1)
test_df.shape

(54268, 23)

In [30]:
val_df = pd.concat([val_x, val_y], axis=1)
val_df.shape

(54269, 23)

In [31]:
val_df.head()

Unnamed: 0,full_text,text_translation,screen_name,description,desc_translation,weekofyear,weekday,day,month,year,...,latitude,longitude,altitude,province,hisco_standard,hisco_code,industry,sentiment_pattern,subjective_pattern,label
11367,RT @Ole_S_Hansen: The HG #copper fund short mo...,RT @Ole_S_Hansen: The HG #copper fund short mo...,CellarPolitics,Warehouse Coordinator | (Precious) Metals Zeal...,Warehouse Coordinator | (Precious) Metals Zeal...,6.0,6.0,9.0,2.0,2020.0,...,52.50017,5.748082,0.0,Flevoland,,,False,-0.1,0.1,neu
9462,RT @autistmijwat: @AutismeNVA @PASNederland @A...,RT @autistmijwat: @AutismeNVA @PASNederland @A...,monique_hl,,,12.0,0.0,16.0,3.0,2020.0,...,52.50017,5.748082,0.0,Flevoland,,,False,0.0,0.0,neu
26774,RT @utregsrealiste: @ang_haar Anders: zorgen o...,RT @utregsrealiste: @ang_haar Others worry abo...,Tarotfritts100,,,31.0,6.0,2.0,8.0,2020.0,...,0.0,0.0,0.0,False,,,False,0.275,0.8,pos
20712,"Nu gisteren, maar leuk als je tijd hebt om ter...","Now yesterday, but fun if you have time to lis...",Fortpampus,#PAMPUS UNESCO werelderfgoed / eiland / https:...,#PAMPUS UNESCO World / Island / https://t.co/e...,28.0,6.0,12.0,7.0,2020.0,...,0.0,0.0,0.0,False,,,False,0.6,0.966667,pos
13023,#carpoolen #op1 #corona ik denk dat carpoolen ...,#carpoolen # op1 #corona I think finally carpo...,warsdenker,dwarse denker. Waarom ?,lateral thinker. Why ?,26.0,2.0,24.0,6.0,2020.0,...,-46.897991,168.128378,0.0,False,,,False,0.35,0.9,pos


In [32]:
print(test_df.isnull().sum())

full_text             0
text_translation      0
screen_name           0
description           0
desc_translation      0
weekofyear            0
weekday               0
day                   0
month                 0
year                  0
location              0
point_info            0
point                 0
latitude              0
longitude             0
altitude              0
province              0
hisco_standard        0
hisco_code            0
industry              0
sentiment_pattern     0
subjective_pattern    0
label                 0
dtype: int64


In [33]:
result = train_df[:600].to_json(orient="records")

parsed = json.loads(result)

with open('train.jsonl', 'w') as outfile:
    json.dump(parsed, outfile)

In [34]:
result = test_df[:200].to_json(orient="records")

parsed = json.loads(result)

with open('test.jsonl', 'w') as outfile:
    json.dump(parsed, outfile)

In [35]:
result = val_df[:200].to_json(orient="records")

parsed = json.loads(result)

with open('val.jsonl', 'w') as outfile:
    json.dump(parsed, outfile)

In [36]:
%%time
train = pd.read_json("train.jsonl", encoding="utf-8")
print(train.shape)
train.head()

(600, 23)
CPU times: user 24.7 ms, sys: 6 µs, total: 24.7 ms
Wall time: 24 ms


Unnamed: 0,full_text,text_translation,screen_name,description,desc_translation,weekofyear,weekday,day,month,year,...,latitude,longitude,altitude,province,hisco_standard,hisco_code,industry,sentiment_pattern,subjective_pattern,label
0,"Maar , er iets nuttigs mee doen ? Zie jij 'm v...","However, there is something useful to do with ...",RonaldMeeuwis,,,21,3,21,5,2020,...,0.0,0.0,0,False,,,False,0.0,0.0,neu
1,RT @daphneskopelos: Door de coronacrisis zijn ...,RT @daphneskopelos: The corona crisis are abou...,IBeugel,"Journalist, programmamaker, schrijver, oud Bal...","Journalist, filmmaker, writer, former Balkans ...",23,2,3,6,2020,...,38.995368,21.987713,0,False,,,False,0.0,0.0,neu
2,RT @Geovation: What role do ethics and locatio...,RT @Geovation: What role do ethics play data a...,hanscees,"Systeemdenker, eigenaar https://t.co/5Cgd9GwmW...","Systems Thinker, owner https://t.co/5Cgd9GwmWt...",18,4,1,5,2020,...,52.50017,5.748082,0,Flevoland,,,False,0.0,0.0,neu
3,RT @MollyJongFast: Laura Ingraham is going to ...,RT @MollyJongFast: Laura Ingraham is going to ...,LDUniGr,CSO CarbExplore BV https://t.co/8IoRT28pWm\nem...,CSO CarbExplore BV https://t.co/8IoRT28pWm\nem...,21,0,18,5,2020,...,53.219065,6.568008,0,Groningen,,,False,0.0,0.0,neu
4,RT @kneeyockartee: Corona has robbed me of man...,RT @kneeyockartee: Corona has robbed me of man...,transxlucence,welkom in de chaos van mijn hoofd. geniet van ...,welcome to the chaos of my head. enjoy the con...,37,6,13,9,2020,...,52.094975,5.109708,0,Utrecht,,,False,0.0,0.0,neu


In [37]:
train['label'].value_counts()

neu    346
pos    157
neg     97
Name: label, dtype: int64

In [38]:
break here

SyntaxError: invalid syntax (<ipython-input-38-3ef470a3d0aa>, line 1)

In [None]:
%%time
test = pd.read_json("test.jsonl", encoding="utf-8")
print(test.shape)
test['label'].value_counts()

In [None]:
%%time
dev = pd.read_json("dev.jsonl", encoding="utf-8")
print(dev.shape)
dev['label'].value_counts()

In [None]:
dev['label'].isnull().sum()

In [None]:
%%time
f = open("test.jsonl")

In [None]:
filepath = "train.jsonl"
with open(filepath, encoding="utf-8") as f:
    tweets = json.load(f)
    for id_, data in enumerate(tweets):
        print(data["full_text"])
        if id_>5:
            break

In [None]:
for idx, row in enumerate(line):
    print(row)
    if idx > 10:
        break

In [None]:
!pip install jsonlines

In [None]:
import jsonlines

In [None]:
with jsonlines.open('test.jsonl') as reader:
    for id_, row in enumerate(reader):
        data = json.loads(row)
        print(data)
        if id_ >10:
            break

In [None]:
key = None
data[key]