# **Data Cleaning Sample and Working with a Local Database Connection**

In [1]:
# Import modules
import sys
import os
import pandas as pd

In [2]:
# Resolve the sibling 'scripts' folder to an absolute path, then expose it to the import system
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
sys.path.append(scripts_dir)

In [3]:
# Now import the modules
from data_cleaning import load_csv, clean_dataframe, save_cleaned_data

### **Load and Inspect Raw Data**

In [4]:
# Read the raw scraped messages through the project's CSV loader
raw_csv_path = "../data/messages/scraped_messages.csv"
df = load_csv(raw_csv_path)

# Preview the first ten rows
df.head(10)


Unnamed: 0,channel_name,channel_title,date,text
0,DoctorsET,Doctors Ethiopia,2023-12-18 17:04:02,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...
1,DoctorsET,Doctors Ethiopia,2023-11-03 16:14:39,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...
2,DoctorsET,Doctors Ethiopia,2023-10-02 16:37:39,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...
3,DoctorsET,Doctors Ethiopia,2023-09-16 07:54:32,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...
4,DoctorsET,Doctors Ethiopia,2023-09-01 16:16:15,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...
5,DoctorsET,Doctors Ethiopia,2023-08-29 17:20:05,👇👇👇👇👇👇 https://youtu.be/-AR1KO2DbFw?si=47cXLZt...
6,DoctorsET,Doctors Ethiopia,2022-08-02 17:42:08,ክረምቱን ስፖርት መስራት አስበው ጂም ለመግባት ካልቻሉ ባሉበት ቦታ ሆነው...
7,DoctorsET,Doctors Ethiopia,2022-06-12 17:15:47,ስፖርት የመስራት ሱስ ይኖር ይሆን?\n\nበአሁኑ ወቅት ብዙ የስፖርት መስ...
8,DoctorsET,Doctors Ethiopia,2022-05-31 17:51:13,ድንገተኛ አደጋ / የአጥንት ስብራት\n\nአያርገውና ድንገተኛ የሆነ አደጋ...
9,DoctorsET,Doctors Ethiopia,2022-05-20 18:04:53,ከትንሽ ግዚያት በፊት ስፖርት መስራት እንደ ቅንጦት ይታይ ነበር አሁን ላ...


### **Clean and Standardize Data**

In [5]:
# Pass the raw frame through the project's cleaning helper; the result is kept
# under its own name (df_cleaned), separate from the raw `df`.
df_cleaned = clean_dataframe(df)

# Display cleaned dataset (up to the first 50 rows)
df_cleaned.head(50)

Unnamed: 0,channel_name,channel_title,text_date,message,emoji_used
0,DoctorsET,Doctors Ethiopia,2023-12-18 17:04:02,"በቀን አንዴ ብቻ የሚባለው የቢዝነስ አማካሪ በ 10,000 ብር ብቻ የተ...",👈👈👇👇
1,DoctorsET,Doctors Ethiopia,2023-11-03 16:14:39,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,👇
2,DoctorsET,Doctors Ethiopia,2023-10-02 16:37:39,ሞት በስኳር ለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀንሰው ...,No emoji
3,DoctorsET,Doctors Ethiopia,2023-09-16 07:54:32,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ? ሙሉ ቪ...,👇👇👇👇
4,DoctorsET,Doctors Ethiopia,2023-09-01 16:16:15,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,No emoji
5,DoctorsET,Doctors Ethiopia,2023-08-29 17:20:05,** ዶክተርስ ኢትዮጽያ በአዲስ ፕሮገራም ጀመረ** ማረጥ (**ሜኖፖዝ )...,👇👇👇👇👇👇
6,DoctorsET,Doctors Ethiopia,2022-08-02 17:42:08,ክረምቱን ስፖርት መስራት አስበው ጂም ለመግባት ካልቻሉ ባሉበት ቦታ ሆነው...,👇👇👇👇👇
7,DoctorsET,Doctors Ethiopia,2022-06-12 17:15:47,ስፖርት የመስራት ሱስ ይኖር ይሆን? በአሁኑ ወቅት ብዙ የስፖርት መስሪያ ...,👇👇👇👇👇👇
8,DoctorsET,Doctors Ethiopia,2022-05-31 17:51:13,ድንገተኛ አደጋ / የአጥንት ስብራት አያርገውና ድንገተኛ የሆነ አደጋ ቢደ...,👇👇👇👇👇👇👇
9,DoctorsET,Doctors Ethiopia,2022-05-20 18:04:53,ከትንሽ ግዚያት በፊት ስፖርት መስራት እንደ ቅንጦት ይታይ ነበር አሁን ላ...,👇👇👇👇👇👇


In [6]:
# Tally nulls per column, then show only the columns that actually contain any
missing_values = df_cleaned.isna().sum()
missing_values.loc[missing_values > 0]

Series([], dtype: int64)

In [7]:
# Get the dimensions of the cleaned DataFrame as a (number of rows, number of columns) tuple
df_cleaned.shape

(3828, 5)

The output (3828, 5) indicates that the DataFrame contains 3828 rows and 5 columns.

In [8]:
# Get the total number of elements (cells) in the cleaned DataFrame, i.e. rows * columns
df_cleaned.size

19140

### **Save the Cleaned Data**

In [9]:
# Save the cleaned frame to CSV under data/processed using the project's save helper
save_cleaned_data(df_cleaned, "../data/processed/cleaned_telegram_data.csv")

✅ Cleaned data saved successfully to '../data/processed/cleaned_telegram_data.csv'.


# **Connect to Database**

In [13]:
# Project-local database helpers; presumably resolved via the 'scripts' directory
# appended to sys.path earlier in this notebook — confirm the module location.
from database_setup import get_db_connection, create_table, insert_data
# Connection handle passed to create_table() in the next step
engine = get_db_connection()

###  **Create Table in PostgreSQL**

In [14]:
# Create the target table using the connection obtained above.
# NOTE(review): table name/schema are defined inside database_setup.create_table — not visible here.
create_table(engine)

###  **Insert Data into Database**

In [15]:
# Load the cleaned CSV into a DataFrame.
# The path must match the location the save step wrote to
# ("../data/processed/cleaned_telegram_data.csv"); reading from "../data/" directly
# would fail on a fresh run or silently pick up a stale file from an earlier session.
cleaned_df = pd.read_csv("../data/processed/cleaned_telegram_data.csv")

In [16]:
# Parse the 'text_date' column as datetime. errors="coerce" turns any unparseable
# value into NaT instead of raising, so bad dates show up in the null counts below.
cleaned_df["text_date"] = pd.to_datetime(cleaned_df["text_date"], errors="coerce")

# Count missing values per column before inserting.
# NOTE(review): the recorded output reports 3 missing 'message' values even though
# df_cleaned had none before saving — presumably empty strings were read back as NaN
# in the CSV round trip; confirm and decide how to handle them before the insert.
missing_values = cleaned_df.isnull().sum()
print("Missing Values Before Insert:", missing_values)

Missing Values Before Insert: channel_name     0
channel_title    0
text_date        0
message          3
emoji_used       0
dtype: int64
