In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Turkish Split

In [51]:
turkish_df = pd.read_csv("sentiment/raw/TurkishTweets.csv", encoding = "ISO-8859-1")

In [52]:
turkish_df.columns = ["Label", "Tweets", "1", "2", "3", "4"]
turkish_df = turkish_df[["Label", "Tweets"]]

In [53]:
turkish_df.loc[turkish_df["Label"] == "Negatif", "Label"] = "Negative"
turkish_df.loc[turkish_df["Label"] == " Negatif", "Label"] = "Negative"
turkish_df.loc[turkish_df["Label"] == "Pozitif", "Label"] = "Positive"

In [54]:
turkish_df

Unnamed: 0,Label,Tweets
0,Negative,Doða aðzýmýza sýçsa hakký var
1,Positive,"Anne bir sanatçýdýr, en güzel eseri de yavrusu..."
2,Negative,ibrahimin oðlunu koruyan Tanrýya da ben sokayým
3,Negative,Köpeðim suratýna sýçsýn senin namussuz karý
4,Negative,"Ben söðüþledim, birazda sen söðüþle"
...,...,...
11106,Negative,eðer gidip bir aynaya bakarsanýz orada en büyü...
11107,Negative,Asýl fahiþe senin beynin
11108,Negative,Sorsak bir numaralý müslümandýr sýfatsýz herif
11109,Negative,surata bak lenetlenmiþ oç


In [55]:
# Counts for each class
print(turkish_df["Label"].value_counts().to_dict())

{'Positive': 6111, 'Negative': 5000}


## Split Data

In [56]:
train_df, temp_df = train_test_split(turkish_df, test_size=0.2, random_state=14, stratify=turkish_df["Label"])
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=14, stratify=temp_df["Label"])

In [57]:
len_train = len(train_df)
len_test = len(test_df)
len_val = len(val_df)

(len_train, len_test, len_val)

(8888, 1111, 1112)

In [58]:
# Counts for each class
print(train_df["Label"].value_counts().to_dict())
print(val_df["Label"].value_counts().to_dict())
print(test_df["Label"].value_counts().to_dict())

{'Positive': 4888, 'Negative': 4000}
{'Positive': 612, 'Negative': 500}
{'Positive': 611, 'Negative': 500}


In [60]:
train_df.to_csv("sentiment/splits/turkish/train.csv", index=False)
test_df.to_csv("sentiment/splits/turkish/test.csv", index=False)
val_df.to_csv("sentiment/splits/turkish/val.csv", index=False)

# English Split

In [61]:
english_df = pd.read_csv("sentiment/raw/EnglishTweets.txt", sep="\t", names = ['Label', 'Tweets'])

In [62]:
english_df.loc[english_df["Label"] == 0, "Label"] = "Negative"
english_df.loc[english_df["Label"] == 1, "Label"] = "Positive"

In [63]:
english_sampled_df = english_df.sample(n=turkish_df.shape[0])

In [64]:
english_sampled_df

Unnamed: 0,Label,Tweets
145316,Positive,"@nicksantino buffalo?! dude, I live in buffalo..."
125877,Negative,"i had a coffee toffee twisted frosty, and i mu..."
113814,Positive,so i came to the conclusion a trip to disneyla...
91078,Positive,@pinkelephantpun I come from a long line of It...
149090,Negative,I got stung by something
...,...,...
120668,Negative,Computer hit by virus via FB. It sent virus to...
73832,Positive,@jemimah_knight I get really tired of people w...
64050,Positive,@RachelLock22 u find the BEST v's videos hhaha
18444,Positive,"@z357x I'd have to agree, rofl. You aren't lame"


In [65]:
# Counts for each class
print(english_sampled_df["Label"].value_counts().to_dict())

{'Positive': 5562, 'Negative': 5549}


## Split Data

In [66]:
train_df, temp_df = train_test_split(english_sampled_df, test_size=0.2, random_state=14, stratify=english_sampled_df["Label"])
test_df, val_df = train_test_split(temp_df, test_size=0.5, random_state=14, stratify=temp_df["Label"])

In [67]:
len_train = len(train_df)
len_test = len(test_df)
len_val = len(val_df)

(len_train, len_test, len_val)

(8888, 1111, 1112)

In [68]:
# Counts for each class
print(train_df["Label"].value_counts().to_dict())
print(val_df["Label"].value_counts().to_dict())
print(test_df["Label"].value_counts().to_dict())

{'Positive': 4449, 'Negative': 4439}
{'Positive': 557, 'Negative': 555}
{'Positive': 556, 'Negative': 555}


In [69]:
train_df.to_csv("sentiment/splits/english/train.csv", index=False)
test_df.to_csv("sentiment/splits/english/test.csv", index=False)
val_df.to_csv("sentiment/splits/english/val.csv", index=False)