In [8]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [9]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') in the
# preprocessing cell needs it to be available locally.
# NOTE(review): load_files is imported here but is not used by any of the
# visible cells -- confirm whether it is still needed.
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LOQ\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [12]:
# Read the IMDB reviews CSV (expected next to the notebook) into a DataFrame.
movie_reviews = pd.read_csv("IMDB Dataset.csv")

# The file is assumed to provide a 'review' text column and a 'sentiment'
# label column. Features are the raw review texts; the target is the label
# encoded as an integer (positive -> 1, negative -> 0). Any label outside
# that mapping becomes NaN.
X, y = (
    movie_reviews['review'],
    movie_reviews['sentiment'].map({'positive': 1, 'negative': 0}),
)


In [None]:
# Preprocessing
# Lazily-filled cache for the resources shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # stopwords.words() returns a fresh list on each call; evaluating it
        # once per token (as a list membership test) dominated the runtime.
        # Load it a single time and keep it as a set for O(1) lookups.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalize one raw document into a space-separated string of stems.

    Steps: lowercase, strip literal ``<br>`` / ``<br />`` markup, replace every
    character outside a-z with a space, drop English stop words, and
    Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (for example NaN produced by a
        missing cell) are treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise crash on .lower().
        return ''

    stop_words, stemmer = _get_preprocess_resources()

    text = text.lower()  # Convert to lowercase
    # Remove HTML line breaks first; otherwise the next substitution leaves a
    # meaningless "br" token behind for every "<br />" in the text.
    # NOTE(review): assumes the reviews contain such markup -- harmless if not.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'[^a-z]', ' ', text)  # Remove non-alphabetic characters

    # Drop stop words, then stem what remains.
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

# Derive the cleaned text from the raw 'review' column (the same Series X was
# originally assigned from) rather than from X itself, so that re-running this
# cell does not preprocess already-stemmed text a second time.
X = movie_reviews['review'].apply(preprocess_text)


In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Bag-of-Words model
# The vocabulary is learned from the training texts only (fit_transform); the
# test texts are then encoded with that same fitted vectorizer, so the test
# split never influences which tokens become features.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
# Fit a multinomial Naive Bayes model (default hyper-parameters) on the
# token-count features of the training split.
classifier = MultinomialNB()
classifier.fit(X=X_train_vec, y=y_train)

In [None]:
# Hard class predictions for the held-out reviews.
y_pred = classifier.predict(X_test_vec)

# Per-class probabilities; the second column (index 1) is kept as the score
# used for the ROC-AUC computation.
class_probabilities = classifier.predict_proba(X_test_vec)
y_prob = class_probabilities[:, 1]

In [None]:
# Score the held-out predictions. All metrics are computed before anything is
# printed, so a failure in any of them produces no partial report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# ROC-AUC is computed from the probability scores, not the hard labels.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)
print(f"ROC-AUC Score: {roc_auc:.4f}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Load dataset from UCI (Wisconsin Prognostic Breast Cancer)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data"

# The file has no header row. Read it WITHOUT a fixed `names` list: when
# `names` is shorter than the actual row width, pandas silently turns the
# surplus leading columns into the index, which shifts "ID"/"Outcome" onto the
# wrong fields. "?" is declared as the missing-value marker so affected cells
# load as NaN (and can be removed by dropna) instead of forcing a column to
# strings.
data = pd.read_csv(url, header=None, na_values="?")

# Name the columns from the width that was actually read: the first two fields
# are the ID and the outcome, everything after them is a generic feature.
columns = ["ID", "Outcome"] + [f"Feature_{i}" for i in range(1, data.shape[1] - 1)]
data.columns = columns

# NOTE(review): per the UCI attribute list the field right after Outcome
# (Feature_1 here) is the recurrence / disease-free time -- confirm whether it
# should be used as a predictor, since it is only known after the outcome.

# Drop ID column if not needed
data = data.drop(columns=["ID"])

# Convert Outcome to binary (assuming 'R' and 'N' represent recurrence and non-recurrence)
data['Outcome'] = data['Outcome'].map({'R': 1, 'N': 0})

# Any label other than 'R'/'N' maps to NaN; fail loudly here rather than let a
# later dropna() quietly discard rows (or the whole dataset).
assert data['Outcome'].notna().all(), "unexpected Outcome label(s); check the column layout"

# Basic EDA
print(data.describe())
print(data.isnull().sum())


In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna()

# 'Outcome' is the target; all remaining columns are predictors.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# 80/20 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Recursive Feature Elimination: keep the 5 features retained by a
# logistic-regression based ranking computed on the training split.
model = LogisticRegression()
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X_train, y_train)

# Report which columns survived the elimination.
selected_features = X.columns[rfe.get_support()]
print("Selected Features:", selected_features)

# Restrict both splits to the selected columns and fit the final model.
X_train_selected = X_train[:, rfe.get_support()]
X_test_selected = X_test[:, rfe.get_support()]
model.fit(X_train_selected, y_train)

# Hard predictions and the probability column at index 1 (used for ROC-AUC).
y_pred = model.predict(X_test_selected)
y_prob = model.predict_proba(X_test_selected)[:, 1]

# Compute all metrics first, then print the report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_prob)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)
print(f"ROC-AUC Score: {roc_auc:.4f}")

In [None]:
# Horizontal bar chart of the RFE ranking for every feature column.
# Drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(X.columns, rfe.ranking_)
ax.set_xlabel("Feature Ranking")
ax.set_ylabel("Features")
ax.set_title("Feature Selection Using RFE")
plt.show()