# Decision Tree

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv(r'C:\Users\rajes\Downloads\cats_dogs.csv')
df.head()

Unnamed: 0,ID,Description,Animal
0,1,"A small, fluffy animal with pointy ears and a ...",Cat
1,2,A loyal companion with a wagging tail and a bark.,Dog
2,3,An animal that enjoys climbing trees and chasi...,Cat
3,4,Known for its playful behavior and friendly na...,Dog
4,5,A creature that purrs when content and loves t...,Cat


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score


X = df['Description']
y = df['Animal']


pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('select', SelectKBest(chi2, k=100)), 
    ('clf', StackingClassifier(
        estimators=[
            ('dt', DecisionTreeClassifier(random_state=42)),
            ('gb', GradientBoostingClassifier(random_state=42))
        ],
        final_estimator=DecisionTreeClassifier(random_state=42)
    ))
])


param_grid = {
    'vectorizer__max_df': [0.8, 0.9, 1.0],
    'vectorizer__min_df': [1, 2, 3],
    'select__k': [50, 100, 150],
    'clf__final_estimator__max_depth': [None, 10, 20],
    'clf__final_estimator__min_samples_split': [2, 5, 10]
}


grid_search = GridSearchCV(pipeline, param_grid, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring='accuracy')

grid_search.fit(X, y) 


print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")

best_pipeline = grid_search.best_estimator_

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)



  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'clf__final_estimator__max_depth': None, 'clf__final_estimator__min_samples_split': 2, 'select__k': 50, 'vectorizer__max_df': 1.0, 'vectorizer__min_df': 3}
Best cross-validation score: 0.65




Accuracy on test set: 33.33%


In [7]:

cv_scores = cross_val_score(best_pipeline, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring='accuracy')
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean: {cv_scores.mean():.2f}")




Cross-validation scores: [0.75 0.25 0.5  0.75 1.  ]
Mean: 0.65


In [8]:

new_descriptions = [
    "An animal that loves to sleep and cuddle.",
    "A loyal companion with a wagging tail and a bark.",
    "Known for its playful behavior and friendly"
]
predictions = best_pipeline.predict(new_descriptions)
print(predictions)

['Cat' 'Cat' 'Cat']
