##Q1. Assignment

In [17]:
tips.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [13]:
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the tips dataset
tips = sns.load_dataset('tips')
X = tips.drop('sex', axis=1)
y = tips['sex']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create numerical and categorical pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

# Identify the categorical and numerical columns
num_cols = X.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X.select_dtypes(include=['category', 'object']).columns

# Combine numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Create the final pipeline with preprocessor and Random Forest Classifier
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

# Predict the target variable on the testing data
y_pred = pipe.predict(X_test)

# Evaluate the model using accuracy and classification report
acc = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(acc))

report = classification_report(y_test, y_pred)
print('Classification Report:\n', report)


Accuracy: 0.61
Classification Report:
               precision    recall  f1-score   support

      Female       0.50      0.37      0.42        19
        Male       0.66      0.77      0.71        30

    accuracy                           0.61        49
   macro avg       0.58      0.57      0.57        49
weighted avg       0.60      0.61      0.60        49



###In this  pipeline, we use the tips dataset and split it into training and testing sets. We create separate pipelines for numerical and categorical features, and combine them using ColumnTransformer. We then use Random Forest Classifier instead of Logistic Regression to build the final model. Finally, we fit the pipeline on the training data, predict the target variable on the testing data, and evaluate the model using accuracy and classification report.

##Here's an  pipeline using the titanic dataset from Seaborn, where we predict the survival of passengers based on their features:

In [16]:
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the titanic dataset
titanic = sns.load_dataset('titanic')
X = titanic.drop('survived', axis=1)
y = titanic['survived']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create numerical and categorical pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

# Identify the categorical and numerical columns
num_cols = X.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X.select_dtypes(include=['category', 'object']).columns

# Combine numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Create the final pipeline with preprocessor and Random Forest Classifier
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

# Predict the target variable on the testing data
y_pred = pipe.predict(X_test)

# Evaluate the model using accuracy and classification report
acc = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(acc))

report = classification_report(y_test, y_pred)
print('Classification Report:\n', report)


Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       105
           1       1.00      1.00      1.00        74

    accuracy                           1.00       179
   macro avg       1.00      1.00      1.00       179
weighted avg       1.00      1.00      1.00       179



##Q2. Build a pipeline that includes a random forest classifier and a logistic regression classifier, and then use a voting classifier to combine their predictions. Train the pipeline on the iris dataset and evaluate its accuracy.

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier

iris = load_iris()

# create numerical and categorical transformers
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())])

# specify the columns that should be transformed
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, [0, 1, 2, 3]),
        ('cat', categorical_transformer, [])])

# create the random forest and logistic regression classifiers
rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression(random_state=42)

# create a voting classifier with the random forest and logistic regression classifiers
voting_clf = VotingClassifier(estimators=[('rf', rf), ('lr', lr)], voting='hard')

# create the final pipeline by combining the preprocessor and the voting classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('voting_clf', voting_clf)])

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# train the pipeline
pipeline.fit(X_train, y_train)

# evaluate the accuracy of the pipeline on the test set
accuracy = pipeline.score(X_test, y_test)
print(f"Accuracy: {accuracy}")


Accuracy: 1.0
