<a href="https://colab.research.google.com/github/Suryansh-gp/DEVA_AI/blob/main/DEVA_AI_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install crewai gradio



***Libraries***

In [None]:
import pandas as pd
import numpy as np
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
from crewai import Agent
import os


import warnings
# Silence every Python warning for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')

# Make sure OPENAI_API_KEY is present in the environment without clobbering a
# key that is already configured (e.g. exported in the shell or injected by the
# notebook host). Prefer supplying the real key through the environment rather
# than hard-coding it here.
os.environ.setdefault("OPENAI_API_KEY", "")

***Problem Identifier Agent***

In [None]:
# Problem Identifier Agent
class ProblemIdentifierAgent:
    """Rule-based agent that maps dataset column names to candidate ML problems.

    A problem statement is proposed when every column it requires is present
    in the dataset; each proposed statement is then paired with a short list
    of suggested algorithms chosen by keyword.
    """

    # Upper bound on the number of problem statements returned by analyze_data.
    MAX_PROBLEMS = 3

    # (required columns, problem statement), evaluated in this order.
    _PROBLEM_RULES = (
        # Customer Churn Prediction
        (
            ("tenure", "Contract", "MonthlyCharges", "TotalCharges",
             "InternetService", "OnlineSecurity", "TechSupport",
             "PaymentMethod", "Churn"),
            "Predict customer churn based on tenure, contract type, and service usage.",
        ),
        # Customer Segmentation for Personalized Marketing
        (
            ("gender", "SeniorCitizen", "Partner", "Dependents", "tenure",
             "PhoneService", "MultipleLines", "InternetService", "StreamingTV",
             "StreamingMovies", "Contract", "PaymentMethod", "MonthlyCharges"),
            "Segment customers for personalized marketing based on demographics and service usage.",
        ),
        # Service Quality Improvement and Complaint Reduction
        (
            ("TechSupport", "DeviceProtection", "OnlineSecurity",
             "InternetService", "MultipleLines", "StreamingTV",
             "StreamingMovies", "Churn"),
            "Analyze service features to improve quality and reduce customer complaints.",
        ),
        # Customer Lifetime Value (CLV) Prediction
        (
            ("tenure", "Contract", "MonthlyCharges", "TotalCharges",
             "PaymentMethod", "InternetService", "StreamingTV",
             "StreamingMovies"),
            "Predict customer lifetime value based on service usage and contract details.",
        ),
    )

    # (keywords that must ALL occur in the lower-cased statement, algorithms).
    # The first matching rule wins, so the order below is significant.
    _ALGORITHM_RULES = (
        (("churn",), ("Logistic Regression", "Random Forest", "Support Vector Machine")),
        (("segment",), ("K-Means Clustering", "Hierarchical Clustering")),
        # Matched on the two separate words: the statement produced by
        # analyze_data reads "...service features to improve quality...", which
        # never contains the contiguous phrase "service quality".
        (("service", "quality"), ("Decision Trees", "Random Forest", "Gradient Boosting")),
        (("lifetime value",), ("Linear Regression", "Random Forest", "XGBoost")),
    )

    # Used when no keyword rule matches.
    _DEFAULT_ALGORITHMS = ("Logistic Regression", "Naive Bayes", "K-Nearest Neighbors")

    def __init__(self, role, goal, backstory):
        self.role = role
        self.goal = goal
        self.backstory = backstory

    def run(self, dataset_details):
        """Identify problems for a dataset and attach suggested algorithms.

        Args:
            dataset_details: Mapping with at least the keys ``'name'`` and
                ``'columns'`` (an iterable of column names).

        Returns:
            A dict with ``"dataset_name"`` and ``"problem_statements"``; each
            problem statement is a dict with ``"statement"`` and
            ``"suggested_algorithms"``.

        Raises:
            KeyError: If ``'name'`` or ``'columns'`` is missing.
        """
        dataset_name = dataset_details['name']
        dataset_columns = dataset_details['columns']

        problem_statements = self.analyze_data(dataset_columns)
        for problem in problem_statements:
            problem['suggested_algorithms'] = self.suggest_algorithms(problem['statement'])

        return {
            "dataset_name": dataset_name,
            "problem_statements": problem_statements
        }

    def analyze_data(self, columns):
        """Return the problem statements whose required columns are all present.

        Column names are matched exactly (case-sensitive). At most
        ``MAX_PROBLEMS`` statements are returned, in rule order.
        """
        available = set(columns)  # one O(1) lookup per required column
        matches = [
            {"statement": statement}
            for required, statement in self._PROBLEM_RULES
            if all(feature in available for feature in required)
        ]
        return matches[:self.MAX_PROBLEMS]

    def suggest_algorithms(self, problem_statement):
        """Return a list of algorithm names suited to *problem_statement*.

        The statement is matched case-insensitively against keyword rules;
        a generic classification shortlist is returned when nothing matches.
        """
        text = problem_statement.lower()
        for keywords, algorithms in self._ALGORITHM_RULES:
            if all(keyword in text for keyword in keywords):
                return list(algorithms)
        return list(self._DEFAULT_ALGORITHMS)

# Function to process input and return output
def process_input(dataset_columns_input):
    """Turn a comma-separated column list into HTML cards of suggested problems.

    Args:
        dataset_columns_input: Comma-separated column names as typed by the
            user. ``None`` and empty entries (e.g. a trailing comma) are
            ignored.

    Returns:
        An HTML string with one card per identified problem statement, or a
        short notice when no problem statement matches the given columns.
    """
    raw_names = (dataset_columns_input or "").split(",")
    dataset_columns = [name.strip() for name in raw_names if name.strip()]

    dataset_details = {
        "name": "Default Dataset Name",
        "description": "Default Dataset Description",
        "columns": dataset_columns
    }

    # Instantiate the agent with required parameters
    role = "Problem Statement Identifier"
    goal = "Identify problem statements related to customer churn and suggest algorithms."
    backstory = "This agent analyzes dataset columns to generate relevant problem statements."

    agent = ProblemIdentifierAgent(role, goal, backstory)
    results = agent.run(dataset_details)

    problems = results['problem_statements']
    if not problems:
        # Give the user feedback instead of rendering an empty panel.
        return "<p>No matching problem statements were found for the provided columns.</p>"

    cards = []
    for problem in problems:
        # The algorithm list gets an explicit white colour: the card background
        # is black, so inherited (dark) text would be unreadable.
        cards.append(f"""
        <div style="border: 1px solid #007BFF; border-radius: 8px; padding: 16px; margin: 8px; width: 300px; display: inline-block; background-color: black; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);">
            <h4 style="color: #007BFF;">Problem Statement:</h4>
            <p style="color:white;">{problem['statement']}</p>
            <h5 style="color: #28a745;">Suggested Algorithms:</h5>
            <p style="color:white;">{', '.join(problem['suggested_algorithms'])}</p>
        </div>
        """)

    return "".join(cards)


In [None]:
# Gradio Interface for Problem Identifier Agent
# Single-textbox UI: the typed column list goes to process_input and the
# returned HTML cards are rendered as-is.
iface = gr.Interface(
    title="DEVA AI",
    description="## Problem Identifier Agent\nEnter the column names to identify relevant problem statements and suggested algorithms.",
    theme="default",
    fn=process_input,
    inputs=[gr.Textbox(label="Dataset Features (comma-separated)")],
    outputs=gr.HTML(),
)

# Start the Gradio server only when this code runs as the main module.
if __name__ == "__main__":
    iface.launch()

In [None]:
#gr.close_all()

***Data Ingestion Agent***

In [None]:
# Data Ingestion Agent
class DataIngestionAgent(Agent):
    """Agent that loads a tabular dataset into pandas and prints a quick overview."""

    def __init__(self, role, goal, backstory):
        super().__init__(role=role, goal=goal, backstory=backstory)

    def load_data(self, file_url):
        """Read a ``.csv`` or ``.xlsx`` file from a path or URL.

        The extension is matched case-insensitively. Any failure (unsupported
        extension, download or parse error) is reported on stdout.

        Returns:
            The loaded DataFrame, or ``None`` if loading failed.
        """
        try:
            # Check the file extension (case-insensitive, so "DATA.CSV" works too)
            location = file_url.lower()
            if location.endswith('.csv'):
                telecom_data = pd.read_csv(file_url)
            elif location.endswith('.xlsx'):
                telecom_data = pd.read_excel(file_url)
            else:
                raise ValueError("Unsupported file extension. Please provide a .csv or .xlsx file.")

            print("\nDataset loaded successfully.")
            return telecom_data
        except Exception as e:
            # Best-effort boundary: report the problem and signal failure with None.
            print(f"Error loading data: {e}")
            return None

    def explore_data(self, telecom_data):
        """Show a preview, summary statistics, shape and column info for the frame."""
        print("\nPreview of the dataset:\n")
        display(telecom_data.head(10))

        print("\nBasic statistics:\n")
        display(telecom_data.describe())

        print(f"\nNumber of rows: {telecom_data.shape[0]}")
        print(f"\nNumber of columns: {telecom_data.shape[1]}")

        # Display data types
        print("\nData Info:")
        # DataFrame.info() prints its report itself and returns None, so it must
        # not be wrapped in display() (that would render a stray "None").
        telecom_data.info()

    def run(self, file_url):
        """Load the dataset, explore it when loading succeeded, and return it.

        Returns:
            The loaded DataFrame, or ``None`` if loading failed.
        """
        telecom_data = self.load_data(file_url)

        if telecom_data is not None:
            self.explore_data(telecom_data)
        return telecom_data


if __name__ == "__main__":
    # Persona handed to the agent constructor.
    role, goal, backstory = (
        "Data Ingestion Agent",
        "Ingest the telecom customer churn dataset.",
        "This agent loads and explores the dataset for analysis.",
    )

    # Location of the CSV to ingest.
    dataset_url = "https://raw.githubusercontent.com/Suryansh-gp/DEVA_AI/main/Telecom_Churn.csv"

    agent = DataIngestionAgent(role, goal, backstory)

    # run() returns the loaded DataFrame, or None when loading failed.
    telecom_data = agent.run(dataset_url)

***Data Preprocessing Agent***

In [None]:
from IPython.display import display, Markdown
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

class DataPreprocessingAgent:
    """Clean a pandas DataFrame through a fixed sequence of preprocessing steps.

    The agent works on its own copy of the frame it is given, so the caller's
    DataFrame is never modified. Every step prints a short report, updates
    ``self.df`` and returns it.
    """

    def __init__(self, dataframe):
        # Copy up front: several steps assign to or drop columns, which would
        # otherwise leak into the caller's frame.
        self.df = dataframe.copy()

    @staticmethod
    def _show_heading(text):
        """Render *text* as Markdown in a notebook, or print it when IPython is unavailable."""
        try:
            from IPython.display import Markdown, display
        except ImportError:
            print(text)
        else:
            display(Markdown(text))

    def _fill_value(self, column, strategy):
        """Return ``(value, strategy_used)`` for filling *column*; value is None if impossible."""
        series = self.df[column]
        numeric = pd.api.types.is_numeric_dtype(series)

        if strategy in ('mean', 'median') and numeric:
            value = series.mean() if strategy == 'mean' else series.median()
            return (None if pd.isna(value) else value), strategy

        # A mean/median does not exist for non-numeric data; use the most
        # frequent value instead of silently leaving the gaps in place.
        if strategy == 'mode' or strategy in ('mean', 'median'):
            modes = series.mode()
            return (None if modes.empty else modes.iloc[0]), 'mode'

        return None, strategy

    # Handle Missing data
    def handle_missing_values(self, strategy='mean', threshold=30):
        """Drop or fill missing values column by column.

        Args:
            strategy: ``'mean'``, ``'median'`` or ``'mode'``. For non-numeric
                columns ``'mean'``/``'median'`` fall back to the mode.
            threshold: Columns with at most this many missing values have the
                affected rows dropped; columns with more are filled.

        Returns:
            The updated DataFrame.
        """
        self._show_heading("\n***Step 1: Handling missing values...***\n")
        total_missing = self.df.isnull().sum().sum()

        if total_missing == 0:
            print("\nNo missing values found.\n")
        else:
            print(f"\nTotal missing values found: {total_missing}\n")

        for column in list(self.df.columns):
            # Recount per column: rows dropped for an earlier column may
            # already have removed some of this column's gaps.
            missing_count = int(self.df[column].isnull().sum())
            if missing_count == 0:
                continue

            print(f"\nColumn '{column}' has {missing_count} missing values.\n")
            if missing_count <= threshold:
                print(f"\nDropping rows with missing values in '{column}' (count below threshold).\n")
                self.df = self.df.dropna(subset=[column])
                continue

            fill_value, used = self._fill_value(column, strategy)
            if fill_value is None:
                print(f"\nCould not fill '{column}' using {strategy}; values left unchanged.\n")
                continue

            print(f"\nFilling missing values in '{column}' using {used}.\n")
            # Assign the result back instead of fillna(inplace=True) on a
            # column selection, which is chained assignment.
            self.df[column] = self.df[column].fillna(fill_value)

        print("\n---------------------------------------\n")  # Separator for clarity
        return self.df

    #Remove Duplicates
    def remove_duplicates(self):
        """Remove fully duplicated rows and return the updated DataFrame."""
        self._show_heading("***Step 2: Checking for duplicate entries...***\n")
        duplicate_count = self.df.duplicated().sum()
        if duplicate_count > 0:
            print(f"\nFound {duplicate_count} duplicate rows. Removing duplicates.\n")
            self.df = self.df.drop_duplicates()
        else:
            print("\nNo duplicate rows found.\n")

        print("\n---------------------------------------\n")  # Separator for clarity
        return self.df

    #Drop irrelevant columns
    def drop_irrelevant_columns(self):
        """Drop the ``customerID`` column when present and return the updated DataFrame."""
        self._show_heading("\n***Step 3: Dropping specified irrelevant columns...***\n")

        # Drop customerID column if it exists
        if 'customerID' in self.df.columns:
            print("\nDropping column 'customerID'.\n")
            self.df = self.df.drop(columns=['customerID'])
            print("Column 'customerID' has been dropped successfully.")
        else:
            print("\nColumn 'customerID' not found; skipping.\n")

        print("\n---------------------------------------\n")  # Separator for clarity
        return self.df

    def convert_data_types(self):
        """Convert text columns to numeric when every non-blank value is a number.

        String values are stripped of surrounding whitespace first. Blank
        strings in a converted column become NaN. A column that contains any
        non-numeric text is left as (stripped) text rather than having its
        non-numeric values coerced to NaN.

        Returns:
            The updated DataFrame.
        """
        self._show_heading("\n***Step 4: Checking datatypes of columns and converting eligible ones...***\n")

        for column in list(self.df.columns):
            series = self.df[column]
            print(f"\nChecking column: '{column}' (current type: {series.dtype})")

            is_text = (pd.api.types.is_object_dtype(series.dtype)
                       or pd.api.types.is_string_dtype(series.dtype))
            if is_text:
                print(f"\nColumn '{column}' is of object type. Attempting to convert to numeric...")

                # Strip only real strings so non-string objects are not turned into NaN.
                stripped = series.map(lambda value: value.strip() if isinstance(value, str) else value)

                try:
                    numeric = pd.to_numeric(stripped, errors='coerce')
                except (TypeError, ValueError):
                    numeric = None  # values pandas cannot even attempt to parse

                # Values that actually carry content (neither missing nor blank).
                has_value = stripped.notna() & (stripped != "")
                convertible = (
                    numeric is not None
                    and bool(has_value.any())
                    and bool(numeric[has_value].notna().all())
                )

                if convertible:
                    print(f"\nSuccess: Converted '{column}' to numeric.")
                    self.df[column] = numeric
                else:
                    print(f"\nSkipped: '{column}' contains non-numeric values and was not converted.")
                    self.df[column] = stripped
            else:
                print(f"\nColumn '{column}' is already a non-object type. No conversion needed.")

            print("\n********************************************")

        print("\n----------------------------------\n")
        return self.df

    def check_outliers(self, method='remove'):
        """Detect outliers in numeric columns with the 1.5 * IQR rule.

        Args:
            method: ``'remove'`` drops rows outside the bounds (rows where the
                column is NaN are dropped as well, since they fail the range
                test); ``'cap'`` clips values to the bounds. Any other value
                only reports the outlier counts.

        Columns whose interquartile range is zero (for example a mostly-zero
        0/1 indicator) are skipped: the rule would otherwise flag every value
        that differs from the dominant one.

        Returns:
            The updated DataFrame.
        """
        self._show_heading("\n***Step 5: Checking for outliers...***\n")

        numeric_cols = self.df.select_dtypes(include='number').columns

        for column in numeric_cols:
            q1 = self.df[column].quantile(0.25)
            q3 = self.df[column].quantile(0.75)
            iqr = q3 - q1

            if iqr == 0:
                print(f"\nColumn '{column}' has zero interquartile range; skipping outlier check.\n")
                continue

            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outliers = ((self.df[column] < lower_bound) | (self.df[column] > upper_bound)).sum()

            if outliers > 0:
                print(f"\nColumn '{column}' has {outliers} outliers.\n")
                if method == 'remove':
                    self.df = self.df[(self.df[column] >= lower_bound) & (self.df[column] <= upper_bound)]
                elif method == 'cap':
                    # clip() upcasts integer columns when the bounds are fractional.
                    self.df[column] = self.df[column].clip(lower=lower_bound, upper=upper_bound)
            else:
                print(f"\nColumn '{column}' has no outliers.\n")

        print("\n---------------------------------------\n")  # Separator for clarity
        return self.df

    def preprocess(self, strategy='mean', threshold=30, outlier_method='remove'):
        """Run all preprocessing steps in order and return the cleaned DataFrame.

        Args:
            strategy: Fill strategy forwarded to ``handle_missing_values``.
            threshold: Drop/fill threshold forwarded to ``handle_missing_values``.
            outlier_method: ``method`` forwarded to ``check_outliers``.
        """
        self._show_heading("\n**Starting preprocessing steps...**\n")

        # Step 1: Handle missing values
        self.handle_missing_values(strategy=strategy, threshold=threshold)

        # Step 2: Remove duplicates
        self.remove_duplicates()

        # Step 3: Drop irrelevant columns
        self.drop_irrelevant_columns()

        # Step 4: Convert DataType
        # NOTE(review): this runs after step 1, so NaNs created here from blank
        # strings are not seen by handle_missing_values; with
        # outlier_method='remove' such rows are dropped in step 5 instead.
        self.convert_data_types()

        # Step 5: Check for outliers
        self.check_outliers(method=outlier_method)

        # Step 6 (class-imbalance check / SMOTE via check_imbalance) is not wired in.

        print("\nPreprocessing completed.\n")
        return self.df

In [None]:
#Example usage:

if __name__ == "__main__":
    # Descriptive metadata for this step; DataPreprocessingAgent itself only
    # takes the DataFrame, so these three values are not passed to it.
    role, goal, backstory = (
        "Data Preprocessing Agent",
        "Preprocess the telecom customer churn dataset.",
        "This agent cleans and prepares the dataset for analysis.",
    )

    # Fetch the raw CSV.
    dataset_url = "https://raw.githubusercontent.com/Suryansh-gp/DEVA_AI/main/Telecom_Churn.csv"
    telco_data = pd.read_csv(dataset_url)

    # Clean the data with mean filling and a drop threshold of 30.
    agent = DataPreprocessingAgent(telco_data)
    processed_data = agent.preprocess(strategy='mean', threshold=30)

    # Preview the first rows of the cleaned frame.
    display(processed_data.head())



**Starting preprocessing steps...**



***Step 1: Handling missing values...***



No missing values found.


---------------------------------------



***Step 2: Checking for duplicate entries...***



No duplicate rows found.


---------------------------------------




***Step 3: Dropping specified irrelevant columns...***



Dropping column 'customerID'.

Column 'customerID' has been dropped successfully.

---------------------------------------




***Step 4: Checking datatypes of columns and converting eligible ones...***



Checking column: 'gender' (current type: object)

Column 'gender' is of object type. Attempting to convert to numeric...

Skipped: 'gender' contains non-numeric values and was not converted.

********************************************

Checking column: 'SeniorCitizen' (current type: int64)

Column 'SeniorCitizen' is already a non-object type. No conversion needed.

********************************************

Checking column: 'Partner' (current type: object)

Column 'Partner' is of object type. Attempting to convert to numeric...

Skipped: 'Partner' contains non-numeric values and was not converted.

********************************************

Checking column: 'Dependents' (current type: object)

Column 'Dependents' is of object type. Attempting to convert to numeric...

Skipped: 'Dependents' contains non-numeric values and was not converted.

********************************************

Checking column: 'tenure' (current type: int64)

Column 'tenure' is already a non-object typ


***Step 5: Checking for outliers...***



Column 'SeniorCitizen' has 1142 outliers.


Column 'tenure' has no outliers.


Column 'MonthlyCharges' has no outliers.


Column 'TotalCharges' has 19 outliers.


---------------------------------------


Preprocessing completed.



Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# NOTE(review): the triple-quoted string below is a bare expression, not a
# docstring -- nothing inside it is executed. It holds the source of two
# disabled methods (check_imbalance, plot_class_distribution) that use
# `self.df`; presumably they were meant to live on DataPreprocessingAgent --
# confirm before re-enabling. check_imbalance applies SMOTE when the minority
# to majority ratio of the 'Churn' column is below 0.5.
"""    def check_imbalance(self):
     #Check for class imbalance in the target variable and apply SMOTE if necessary.
     display(Markdown("***Step 6: Checking for class imbalance...***\n"))

     if 'Churn' in self.df.columns:  # Ensure correct column name
         churn_counts_before = self.df['Churn'].value_counts()
         print(f"\nClass distribution before SMOTE:\n{churn_counts_before}\n")

         # Plotting the class distribution before SMOTE
         self.plot_class_distribution(churn_counts_before, title='Class Distribution Before SMOTE')

         imbalance_ratio = churn_counts_before.min() / churn_counts_before.max()
         if imbalance_ratio < 0.5:  # Imbalance threshold
            print("Class imbalance detected. Applying SMOTE...\n")

            # Ensure handling of categorical variables before SMOTE
            X = pd.get_dummies(self.df.drop('Churn', axis=1), drop_first=True)  # Handle categorical variables
            y = self.df['Churn']

            smote = SMOTE(random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X, y)

            # Combine resampled data back into a DataFrame
            self.df = pd.DataFrame(X_resampled, columns=X.columns)
            self.df['Churn'] = y_resampled

            churn_counts_after = self.df['Churn'].value_counts()
            print("SMOTE applied. New class distribution:\n", churn_counts_after)

            # Plotting the class distribution after SMOTE
            self.plot_class_distribution(churn_counts_after, title='Class Distribution After SMOTE')
         else:
            print("No class imbalance detected.\n")

     else:
        print("The 'Churn' column is not present in the dataset.\n")

     print("\n---------------------------------------\n")  # Separator for clarity
     return self.df


    def plot_class_distribution(self, churn_counts, title):
        #Plot the distribution of classes.
        plt.figure(figsize=(8, 6))
        sns.barplot(x=churn_counts.index.astype(str), y=churn_counts.values, palette='viridis')
        plt.title(title)
        plt.xlabel('Churn Class')
        plt.ylabel('Number of Instances')
        plt.xticks(rotation=0)
        plt.show()
   """