# **Phase 3**

Missing Values

In [3]:
import pandas as pd

# Load the raw dataset; the path is relative to the notebook's working directory
df = pd.read_csv("kG nlp.csv")

# Step 1: Count missing values per column (computed once and reused below)
missing_counts = df.isnull().sum()
print("Missing Values Check:")
print("Number of missing values in each column:")
print(missing_counts)

# Step 2: Express the same counts as a percentage of the total number of rows
print("\nPercentage of missing values in each column:")
print((missing_counts / len(df) * 100).round(2))

# Step 3: Report whether any handling is required.
# NOTE: neither branch modifies df; the else branch only prints a message.
if missing_counts.sum() == 0:
    print("\nNo missing values found in the dataset. No action required.")
else:
    # Example handling strategies (kept for reference, intentionally not executed):
    # Option 1: Drop rows with missing values
    # df = df.dropna()
    # Option 2: Fill missing values (e.g., with a placeholder)
    # df = df.fillna({'source': 'Unknown', 'target': 'Unknown', 'relation': 'Unknown'})
    print("Missing values were found and would be handled here (e.g., dropping rows or filling with placeholders).")

# Step 4: Show the column summary after the check.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
print("\nDataset Info After Missing Values Check:")
df.info()

# Optional: Save the dataset (df is written out exactly as it was loaded)
df.to_csv("kG_nlp_missing_values_checked.csv", index=False)

# Optional: Display the first few rows of the dataset
print("\nPreview of the dataset after missing values check:")
print(df.head())

Missing Values Check:
Number of missing values in each column:
sentence    0
source      0
target      0
relation    0
tokens      0
tags        0
dtype: int64

Percentage of missing values in each column:
sentence    0.0
source      0.0
target      0.0
relation    0.0
tokens      0.0
tags        0.0
dtype: float64

No missing values found in the dataset. No action required.

Dataset Info After Missing Values Check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43594 entries, 0 to 43593
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  43594 non-null  object
 1   source    43594 non-null  object
 2   target    43594 non-null  object
 3   relation  43594 non-null  object
 4   tokens    43594 non-null  object
 5   tags      43594 non-null  object
dtypes: object(6)
memory usage: 2.0+ MB
None

Preview of the dataset after missing values check:
                                            sentence     source     tar

Removing Duplicate Rows

In [1]:
import pandas as pd

# Read the raw dataset from the notebook's working directory
df = pd.read_csv("kG nlp.csv")

# Row count prior to de-duplication
print(f"Number of rows before removing duplicates: {len(df)}")

# Flag every row that repeats an earlier row in all columns,
# then keep only the rows that are not flagged (first occurrences survive).
is_repeat = df.duplicated(keep="first")
df_no_duplicates = df[~is_repeat]

# Row count once the repeated rows are gone
print(f"Number of rows after removing duplicates: {len(df_no_duplicates)}")

# Persist the de-duplicated frame so later steps can load it from disk
df_no_duplicates.to_csv("kG_nlp_no_duplicates.csv", index=False)

# Show the heading and the first five rows of the result
print(
    "\nPreview of the dataset after removing duplicates:",
    df_no_duplicates.head(),
    sep="\n",
)

Number of rows before removing duplicates: 43594
Number of rows after removing duplicates: 24715

Preview of the dataset after removing duplicates:
                                            sentence     source     target  \
0               a group of soldiers provide security      group   security   
1  this beautiful waves ring is crafted in platin...  beautiful      waves   
2  A plate of salad that includes strawberries on...       that       side   
3  map from which users can select the district i...       they  hospitals   
4      coach is given a guard of honour from players      coach    players   

            relation                                             tokens  \
0        ['provide']  ['a', 'group', 'of', 'soldiers', 'provide', 's...   
1  ['crafted', 'in']  ['this', 'beautiful', 'waves', 'ring', 'is', '...   
2    ['plate', 'of']  ['A', 'plate', 'of', 'salad', 'that', 'include...   
3    ['map', 'from']  ['map', 'from', 'which', 'users', 'can', 'sele...   
4       

Data Transformation

In [2]:
import ast

import pandas as pd


def _relation_to_text(value):
    """Turn a stringified list such as "['grow', 'on']" into the text "grow on".

    Parameters
    ----------
    value : object
        A cell of the 'relation' column. Non-string values are returned unchanged.

    Returns
    -------
    object
        The list items joined by single spaces and trimmed. A string that is not
        a Python literal (for example text that is already plain) is returned
        trimmed, which makes the transformation safe to apply more than once.
    """
    if not isinstance(value, str):
        return value
    try:
        # literal_eval only parses Python literals; unlike eval() it cannot run
        # arbitrary code that might be embedded in the CSV.
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        return value.strip()
    if isinstance(parsed, (list, tuple)):
        return ' '.join(str(part) for part in parsed).strip()
    return str(parsed).strip()


# Load the de-duplicated dataset written by the duplicate-removal step
df = pd.read_csv("kG_nlp_no_duplicates.csv")

# Display the initial state of the 'relation' column
print("Before transformation - Sample of 'relation' column:")
print(df['relation'].head())

# Transform the 'relation' column: convert the string representation of a list
# into a single space-separated string.
# Example: "['grow', 'on']" -> "grow on"
df['relation'] = df['relation'].apply(_relation_to_text)

# Display the transformed 'relation' column
print("\nAfter transformation - Sample of 'relation' column:")
print(df['relation'].head())

# The 'tags' column is left untouched (labels such as SRC, TGT, REL).
# NOTE(review): because the frame is loaded from CSV, the tag lists appear to be
# stored as text rather than as Python lists - confirm before consuming them downstream.
print("\nSample of 'tags' column (no transformation needed):")
print(df['tags'].head())

# Save the transformed dataset to a new CSV file (optional)
df.to_csv("kG_nlp_transformed.csv", index=False)

# Optional: Display the first few rows of the transformed dataset
print("\nPreview of the dataset after transformation:")
print(df.head())

Before transformation - Sample of 'relation' column:
0          ['provide']
1    ['crafted', 'in']
2      ['plate', 'of']
3      ['map', 'from']
4            ['given']
Name: relation, dtype: object

After transformation - Sample of 'relation' column:
0       provide
1    crafted in
2      plate of
3      map from
4         given
Name: relation, dtype: object

Sample of 'tags' column (no transformation needed):
0                 ['O', 'SRC', 'O', 'O', 'REL', 'TGT']
1    ['O', 'SRC', 'TGT', 'O', 'O', 'REL', 'REL', 'O...
2    ['O', 'REL', 'REL', 'O', 'SRC', 'O', 'O', 'O',...
3    ['REL', 'REL', 'O', 'O', 'O', 'O', 'SRC', 'O',...
4    ['SRC', 'O', 'REL', 'SRC', 'O', 'O', 'O', 'O',...
Name: tags, dtype: object

Preview of the dataset after transformation:
                                            sentence     source     target  \
0               a group of soldiers provide security      group   security   
1  this beautiful waves ring is crafted in platin...  beautiful      waves   
2  A 