In [3]:
import pandas as pd 
# Load the question/answer CSV into the notebook-level frame `df`.
# NOTE(review): the rendered preview shows an `Unnamed: 0` column, presumably a
# row index that was written into the file when it was created - confirm whether
# it should be read with index_col=0 instead of being carried as data.
df = pd.read_csv(filepath_or_buffer='greatlearning_ml_questions.csv')

# Preview the first five rows (the default for head()).
df.head(n=5)

Unnamed: 0,question,answer
0,Explain the terms Artificial Intelligence (AI)...,Artificial Intelligence (AI) is the domain of ...
1,What are the different types of Learning/ Trai...,ML algorithms can be primarily classified depe...
2,What is the main key difference between superv...,There are various means to select important va...
3,There are many machine learning algorithms til...,Machine Learning algorithm to be used purely d...
4,State the differences between causality and co...,Causality applies to situations where one acti...


In [31]:
# Lowercase both text columns in place, question first and then answer.
for text_col in ('question', 'answer'):
    df[text_col] = df[text_col].str.lower()


In [39]:
import re

# Characters to delete from each column. Both patterns keep ASCII letters,
# digits and whitespace; the question pattern additionally keeps "?".
question_noise = re.compile(r'[^a-zA-Z0-9\s?]')
answer_noise = re.compile(r'[^a-zA-Z0-9\s]')

# Use the vectorised .str accessor rather than re.sub inside .apply():
# re.sub raises TypeError when a cell is missing (NaN is a float, not a str),
# whereas .str.replace leaves missing cells missing.
# The trailing .str.strip() removes whitespace at the start and end of each value.
df['question'] = df['question'].str.replace(question_noise, '', regex=True).str.strip()
df['answer'] = df['answer'].str.replace(answer_noise, '', regex=True).str.strip()

In [41]:
 # Write the current `df` to clean_data.csv as UTF-8; index=False keeps the
 # DataFrame's row index out of the file.
 # NOTE(review): this statement starts with one leading space - confirm the
 # environment that runs this cell accepts it; a plain Python script would not.
 df.to_csv('clean_data.csv', index=False, encoding='utf-8')

In [44]:
import pandas as pd
import re

def clean_ml_qa_dataset(df):
    """
    Clean the ML Q&A dataset and return the result as a new DataFrame.

    Questions:
    - every character other than ASCII letters, digits, whitespace and
      ? . , ( ) is removed
    - a "?" is appended when the question does not already end with one
    - runs of whitespace are collapsed to a single space

    Answers:
    - a leading "Ans." prefix (optionally preceded by whitespace) is removed
    - the same character filter and whitespace collapsing as for questions

    Missing cells (None / NaN / pd.NA) are returned untouched instead of being
    stringified into the literal text "nan?" / "nan".

    Args:
        df: pandas DataFrame with 'question' and 'answer' columns

    Returns:
        cleaned DataFrame (a copy; the input frame is not modified and any
        other columns are carried over unchanged)

    Raises:
        KeyError: if the 'question' or 'answer' column is absent
    """
    # Compile once per call instead of once per cell.
    disallowed_chars = re.compile(r'[^a-zA-Z0-9\s\?\.\,\(\)]')
    # Leading whitespace is tolerated so " Ans. text" loses its prefix too.
    answer_prefix = re.compile(r'^\s*Ans\.\s*')

    def is_missing(value):
        # pd.isna returns an array for list-like values, so only scalars are
        # ever treated as missing.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    def clean_question(question):
        if is_missing(question):
            return question
        # str() keeps the original coercion of non-string cells (e.g. numbers).
        question = disallowed_chars.sub('', str(question)).strip()
        if not question.endswith('?'):
            question += '?'
        # split()/join collapses internal whitespace runs to single spaces.
        return ' '.join(question.split())

    def clean_answer(answer):
        if is_missing(answer):
            return answer
        # Drop the prefix before filtering characters, while its "." is still
        # unambiguous at the start of the text.
        answer = answer_prefix.sub('', str(answer))
        answer = disallowed_chars.sub('', answer)
        return ' '.join(answer.split())

    # Work on a copy so the caller's frame is left as it was.
    cleaned_df = df.copy()
    cleaned_df['question'] = cleaned_df['question'].apply(clean_question)
    cleaned_df['answer'] = cleaned_df['answer'].apply(clean_answer)

    return cleaned_df

# Demonstration of clean_ml_qa_dataset on three hand-written rows.
# NOTE(review): this rebinds the notebook-level names `data`, `df` and
# `cleaned_df`, so whatever they held before this cell is replaced.
if __name__ == "__main__":
    # One question lacks its "?" and one answer carries the "Ans." prefix,
    # so both cleaning rules are visible in the printed result.
    data = {
        'question': [
            'What is machine learning',
            'How do you handle missing data?',
            'Explain ROC curve?',
        ],
        'answer': [
            'Ans. Machine learning is...',
            'We can handle missing data by...',
            'ROC curve represents...',
        ],
    }
    df = pd.DataFrame(data)

    cleaned_df = clean_ml_qa_dataset(df)
    # Heading and frame on separate lines, as two consecutive prints would give.
    print("Cleaned DataFrame:", cleaned_df, sep="\n")

Cleaned DataFrame:
                          question                            answer
0        What is machine learning?            Machine learning is...
1  How do you handle missing data?  We can handle missing data by...
2               Explain ROC curve?           ROC curve represents...


In [46]:
# Read the CSV file.
df = pd.read_csv(filepath_or_buffer='greatlearning_ml_questions.csv')

# Apply the cleaning function; `df` itself is left as loaded.
cleaned_df = clean_ml_qa_dataset(df)

# Save the result; index=False keeps the DataFrame's row index out of the file.
cleaned_df.to_csv(path_or_buf='cleaned_ml_questions.csv', index=False)