# **Za Baransu**

## **Importing Library**

In [1]:
import pandas as pd
from sklearn.utils import resample

## **Loading the Data**

In [2]:
df1 = pd.read_csv("datasets/D1CrackingArena.csv")
df2 = pd.read_csv("datasets/D2Twitter.csv")
df3 = pd.read_csv("datasets/D3DreamMarket.csv")
df4 = pd.read_csv("datasets/D4Garage4hackers.csv")
df5 = pd.read_csv("datasets/D5CrackingFire.csv")

## **Observing Total Labelling**

In [3]:
print(df1['label'].value_counts())
print(df2['label'].value_counts())
print(df3['label'].value_counts())
print(df4['label'].value_counts())
print(df5['label'].value_counts())

label
NO           1520
YES           114
Undecided      48
Name: count, dtype: int64
label
NO           1633
YES           243
Undecided      51
Name: count, dtype: int64
label
NO           1638
YES           211
Undecided      72
Name: count, dtype: int64
label
NO           1704
YES           225
Undecided      37
Name: count, dtype: int64
label
NO           1864
YES            89
Undecided      21
Name: count, dtype: int64


## **Combining the Datasets**

In [4]:
# Combine all datasets into one df
combined_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

# Check the distribution of labels
print(combined_df['label'].value_counts())

label
NO           8359
YES           882
Undecided     229
Name: count, dtype: int64


## **Dealing With Imbalanced Dataset**

In [5]:
# Separate the data by labels
yes_df = combined_df[combined_df['label'] == 'YES']
no_df = combined_df[combined_df['label'] == 'NO']

In [6]:
# Choose the minority class
min_class_count = int(len(yes_df) * 1.2)

In [7]:
# Oversampling "YES" (increase by 20%)
new_yes_df = resample(yes_df, replace=True, n_samples=min_class_count, random_state=42)

In [8]:
# Downsampling "NO" to match the size of oversampled "YES"
new_no_df = resample(no_df, replace=False, n_samples=min_class_count, random_state=42)

In [9]:
# Combine the "YES" and "NO" labels
new_df = pd.concat([new_yes_df, new_no_df], ignore_index=True)

# Shuffle the dataset
new_df = new_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the distribution of labels
print(new_df['label'].value_counts())

label
NO     1058
YES    1058
Name: count, dtype: int64


## **Exporting the Dataset**

In [10]:
# Export the dataset into a CSV file
new_df.to_csv("datasets/combined_dataset.csv", index=False)

# Preview the dataset
print(new_df.head(10))

                                             content           dataset label
0  PrestaShop Recipes A Problem Solution Approach...     D3DreamMarket    NO
1    Nice post bond.  Just remove typo -       Or...  D4Garage4hackers    NO
2    Latest version of wifite (v2) has also inclu...  D4Garage4hackers   YES
3    On Sunday, the 28th of November 2010 around ...  D4Garage4hackers   YES
4  I have to strongly disagree with the statement...         D2Twitter    NO
5  [RT] [USERNAME] Hack Remote Windows 10 Passwor...         D2Twitter   YES
6                       how to crack realityking.com    D2CrackingFire    NO
7  THE BEST PRO WIFI HACKING TOOLS PACK 2017  Her...     D3DreamMarket   YES
8  [RT] [USERNAME] Analysing the NULL SecurityDes...         D2Twitter   YES
9  Quote: Originally Posted by ShockiNN Well the ...   D1CrackingArena    NO
