# Sales_(oasis_1) - Cleaned Notebook
This notebook was reconstructed from the provided content and saved in a valid JSON structure. It contains the main imports, data loading, preprocessing, EDA, and cleaning steps.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import warnings
# NOTE(review): this silences every warning for the rest of the session; narrow the
# filter if warnings are needed while debugging.
warnings.filterwarnings('ignore')
# word_tokenize needs the Punkt tokenizer data. NLTK 3.9 and later load it from the
# 'punkt_tab' resource, while older releases load the 'punkt' resource, so fetch both
# to keep tokenization working on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')


In [None]:
# Load datasets (update paths if needed)
# Input locations are kept in one place so they are easy to change.
CONVERSATION_CSV = '/Conversation.csv'
QUOTES_CSV = '/train (1).csv'

# Read both CSV files into DataFrames, then report each frame's shape.
conversation_df = pd.read_csv(CONVERSATION_CSV)
quotes_df = pd.read_csv(QUOTES_CSV)
print(f'Conversation shape: {conversation_df.shape}')
print(f'Quotes shape: {quotes_df.shape}')


In [None]:
# Basic preprocessing helpers
def preprocess_text(text):
    """Normalize one raw text value.

    The value is lower-cased, URLs are removed, and every run of whitespace
    (spaces, tabs, newlines) is collapsed to a single space.

    Parameters
    ----------
    text : Any
        A scalar value. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is missing (``None`` or NaN).
    """
    if pd.isna(text):
        return ''
    text = str(text).lower()
    # Strip only real URLs: a scheme followed by '://' or a 'www.' host, starting at a
    # word boundary. The looser pattern 'http\S+|www\S+' also removed ordinary tokens
    # such as 'awwww' (leaving just 'a') or 'httpd'.
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)
    # Collapse whitespace last so the gaps left by removed URLs are cleaned up too.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Combine datasets into a single list and save training_data.txt
# Source order: conversation questions, conversation answers, then the first column of
# the quotes file. Missing cells are dropped before the values are stringified.
all_text = (
    conversation_df['question'].dropna().astype(str).tolist()
    + conversation_df['answer'].dropna().astype(str).tolist()
    + quotes_df.iloc[:, 0].dropna().astype(str).tolist()
)
# Clean each non-empty entry and keep only results that are still non-empty.
processed_text = [
    cleaned
    for cleaned in (preprocess_text(raw) for raw in all_text if raw)
    if len(cleaned) > 0
]
# One cleaned entry per line.
with open('training_data.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{entry}\n' for entry in processed_text)
print(f'Saved training_data.txt with {len(processed_text)} lines')


In [None]:
# Simple EDA - top words
# Only the first 1000 cleaned lines are tokenized.
sample_texts = processed_text[:1000]
# Flatten the per-line token lists into one list of tokens.
all_words = [token for line in sample_texts for token in word_tokenize(line)]
word_freq = Counter(all_words)
# The 20 most frequent tokens as (token, count) pairs.
most_common = word_freq.most_common(20)
print(f'Top 20 words: {most_common}')


In [None]:
# Combine into a dataframe for cleaning
# Both sources are reshaped to the same three columns so they can be stacked row-wise;
# the columns a source does not provide are filled with NaN placeholders.
temp_quotes = quotes_df.copy()
temp_quotes['question'] = np.nan
temp_quotes['answer'] = np.nan
temp_conv = conversation_df.copy()
# Drop the saved-index column if the CSV carried one.
if 'Unnamed: 0' in temp_conv.columns:
    temp_conv = temp_conv.drop(columns=['Unnamed: 0'])
temp_conv['Quotes'] = np.nan
cols = ['Quotes','question','answer']
# NOTE(review): assumes quotes_df has a column named 'Quotes' and conversation_df has
# 'question' and 'answer'; the selections below raise KeyError otherwise. Confirm
# against the actual CSV headers.
temp_quotes = temp_quotes[cols]
temp_conv = temp_conv[cols]
df = pd.concat([temp_quotes, temp_conv], ignore_index=True)
# Normalize columns
# Pass the raw values to preprocess_text. Casting with astype(str) first would turn
# each NaN placeholder into the literal string 'nan', bypassing the helper's
# missing-value check and leaving "nan" text in the cleaned data. The helper already
# stringifies non-missing values itself and maps missing ones to ''.
for c in cols:
    df[c] = df[c].apply(preprocess_text)
# Save combined dataframe for later use
df.to_csv('combined_data_cleaned.csv', index=False, encoding='utf-8')
print('Saved combined_data_cleaned.csv with', len(df), 'rows')


Notes:
- This notebook is a cleaned, valid JSON reconstruction focusing on the code cells from the provided content.
- If you want the full original notebook (with all markdown and outputs) restored exactly, please provide the complete raw .ipynb JSON file (it appears the original was truncated/invalid).
