In [24]:
%pip install tabulate


Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from tabulate import tabulate

# Replace 'your_dataset.csv' with the path to your dataset

dataset = pd.read_csv('../dataset/Bitext.csv')

# Display the first few rows of the dataset

df = pd.DataFrame(dataset)

# Display the original DataFrame

print("Original DataFrame:")

# Check for rows with any data

# This will return a boolean Series indicating if each row has any non-null values

has_data = df.notnull().any(axis=1)

# Display rows that have data

rows_with_data = df[has_data]

# Display the result

print("\nRows with any data:")
print(rows_with_data)

# Optionally, if you want to check if the entire DataFrame is empty

is_empty = df.empty
print("\nIs the DataFrame empty?", is_empty)


Original DataFrame:

Rows with any data:
       flags                                        instruction category  \
0          B   question about cancelling order {{Order Number}}    ORDER   
1        BQZ  i have a question about cancelling oorder {{Or...    ORDER   
2       BLQZ    i need help cancelling puchase {{Order Number}}    ORDER   
3         BL         I need to cancel purchase {{Order Number}}    ORDER   
4      BCELN  I cannot afford this order, cancel purchase {{...    ORDER   
...      ...                                                ...      ...   
26867     BL  I am waiting for a rebate of {{Refund Amount}}...   REFUND   
26868    BIL  how to see if there is anything wrong with my ...   REFUND   
26869   BLQZ  I'm waiting for a reimbjrsement of {{Currency ...   REFUND   
26870     BL  I don't know what to do to see my reimbursemen...   REFUND   
26871     BL  I need to know if there is anything new on the...   REFUND   

             intent                           

In [11]:
# Method 1: Using shape attribute

total_rows_shape = df.shape[0]
print("\nTotal number of rows using shape:", total_rows_shape)

# Method 2: Using len() function

total_rows_len = len(df)
print("Total number of rows using len():", total_rows_len)



Total number of rows using shape: 26872
Total number of rows using len(): 26872


# Flags Column


In [15]:
# Count unique values and their total occurrences

unique_values_count = df['flags'].value_counts()
total_values = unique_values_count.sum()

# Display the total unique values and their combined total occurrences

total_unique = len(unique_values_count)

print("Total unique values:", total_unique)
print("Total occurrences of all values:", total_values)


Total unique values: 394
Total occurrences of all values: 26872


# Instruction Column


In [16]:
# Count unique values and their total occurrences

total_instruction = len(df['instruction'])

# Display the total unique values and their combined total occurrences

print("Total unique values:", total_instruction)


Total unique values: 26872


# Category Column


In [4]:
# Count unique values and their occurrences

unique_values_count = df['category'].value_counts()
total_sum = unique_values_count.sum()
unique_values = unique_values_count.index.tolist()

# Display the total unique values, their names, and summed total

print("Total unique values:", len(unique_values))
print("Unique values:", unique_values)
print("Total occurrences of all values:", total_sum)


Total unique values: 11
Unique values: ['ACCOUNT', 'ORDER', 'REFUND', 'INVOICE', 'CONTACT', 'PAYMENT', 'FEEDBACK', 'DELIVERY', 'SHIPPING', 'SUBSCRIPTION', 'CANCEL']
Total occurrences of all values: 26872


# Intent Column


In [2]:
import pandas as pd
from tabulate import tabulate

# Assuming df is your DataFrame
# Group intents by category and count occurrences
grouped_data = df.groupby(['category', 'intent']).size().reset_index(name='count')

# Calculate the total counts and the number of unique intents per category
category_summary = grouped_data.groupby('category').agg(
    total_counts=('count', 'sum'),
    unique_intents=('intent', 'nunique')
).reset_index()

# Create a summary table for each category
for category in category_summary['category']:
    category_data = grouped_data[grouped_data['category'] == category]
    total_counts = category_data['count'].sum()
    unique_intents = category_data['intent'].nunique()
    total_row = pd.DataFrame({
        'category': [category],
        'intent': ['Total'],
        'count': [total_counts]
    })
    category_data = pd.concat([category_data, total_row], ignore_index=True)
    
    # Convert the DataFrame to a tabular format
    table = tabulate(category_data, headers='keys', tablefmt='grid')
    
    # Display the table
    print(f"Category: {category}")
    print(table)
    print("\n")


Category: ACCOUNT
+----+------------+-----------------------+---------+
|    | category   | intent                |   count |
|  0 | ACCOUNT    | create_account        |     997 |
+----+------------+-----------------------+---------+
|  1 | ACCOUNT    | delete_account        |     995 |
+----+------------+-----------------------+---------+
|  2 | ACCOUNT    | edit_account          |    1000 |
+----+------------+-----------------------+---------+
|  3 | ACCOUNT    | recover_password      |     995 |
+----+------------+-----------------------+---------+
|  4 | ACCOUNT    | registration_problems |     999 |
+----+------------+-----------------------+---------+
|  5 | ACCOUNT    | switch_account        |    1000 |
+----+------------+-----------------------+---------+
|  6 | ACCOUNT    | Total                 |    5986 |
+----+------------+-----------------------+---------+


Category: CANCEL
+----+------------+------------------------+---------+
|    | category   | intent                 |

In [3]:

# 1. Data Cleaning
## Remove duplicates
df_unique = df.drop_duplicates(subset=['instruction', 'response'])


## Consistency in formatting (e.g., make all text lowercase)
df['instruction'] = df['instruction'].str.lower()
df['response'] = df['response'].str.lower()

# 2. Normalization
## Placeholder formatting consistency
df['instruction'] = df['instruction'].str.replace(r'\{\{order number\}\}', '{{Order Number}}')

# 4. Categorization Review
## Check the balance of intents (if needed)
intent_counts = df['intent'].value_counts()

# 5. Tokenization (Using a tokenization library, e.g., nltk or sentencepiece)
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')  # Download required resources for tokenizing

df['tokenized_instructions'] = df['instruction'].apply(word_tokenize)


# 8. Validation (Simple data quality check - Check for NaNs)
missing_data = df.isnull().sum()

# 9. Test Set Creation (Splitting the dataset)
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['intent'])
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42, stratify=test_df['intent'])

# Save the preprocessed data to CSV
train_df.to_csv('train_data.csv', index=False)
val_df.to_csv('val_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)


[nltk_data] Downloading package punkt to /home/reagan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
