In [4]:
import pandas as pd
from sklearn.neighbors import KernelDensity
 
# Load the dataset from the CSV file.
df = pd.read_csv('lab_4.csv')

# Split the frame into the four predictor columns and the class label column.
features = df[['age', 'income', 'student', 'credit_rating']]
target = df['buys_computer']

# One-hot encode the predictors: pd.get_dummies expands each string-valued
# column into one indicator column per distinct value
# (e.g. 'age_<=30', 'income_high').
features_encoded = pd.get_dummies(features)

# Fit one 1-D kernel density estimator per (class label, indicator column),
# using only the rows that belong to that class.
class_conditional_densities = {}
for class_label in target.unique():
    instances = features_encoded[target == class_label]
    for feature in features_encoded.columns:
        kde = KernelDensity(bandwidth=0.5)  # You may need to adjust the bandwidth
        kde.fit(instances[[feature]])
        class_conditional_densities[(class_label, feature)] = kde

# Score every estimator exactly once on the full encoded column and reuse the
# result for both the report and the zero-density check below (previously each
# estimator was scored twice).
# NOTE: score_samples returns *log*-densities, so the values printed under the
# "Density" label are logarithms and a true zero density shows up as -inf.
log_densities = {
    (class_label, feature): kde.score_samples(features_encoded[[feature]])
    for (class_label, feature), kde in class_conditional_densities.items()
}

# Display the class conditional (log-)densities.
for (class_label, feature), scores in log_densities.items():
    print(f"Class: {class_label}, Feature: {feature}, Density: {scores}")

# Collect every (class, feature) pair with at least one sample of zero density
# (log-density == -inf), using a vectorised comparison on the score array.
# NOTE(review): KernelDensity is constructed without a `kernel` argument; if
# the default kernel has unbounded support this list will always be empty.
# Confirm against the scikit-learn documentation.
zero_densities = [
    pair
    for pair, scores in log_densities.items()
    if (scores == float('-inf')).any()
]

# Display features and classes with zero densities.
print("\nFeatures and Classes with Zero Densities:")
print(zero_densities)

Class: no, Feature: age_31...40, Density: [-0.22579135 -0.22579135 -2.22579135 -0.22579135 -0.22579135 -0.22579135
 -2.22579135 -0.22579135 -0.22579135 -0.22579135 -0.22579135 -2.22579135
 -2.22579135 -0.22579135]
Class: no, Feature: age_<=30, Density: [-0.65023423 -0.65023423 -0.95726122 -0.95726122 -0.95726122 -0.95726122
 -0.95726122 -0.65023423 -0.65023423 -0.95726122 -0.65023423 -0.95726122
 -0.95726122 -0.95726122]
Class: no, Feature: age_>40, Density: [-0.65023423 -0.65023423 -0.65023423 -0.95726122 -0.95726122 -0.95726122
 -0.65023423 -0.65023423 -0.65023423 -0.95726122 -0.65023423 -0.65023423
 -0.65023423 -0.95726122]
Class: no, Feature: income_high, Density: [-0.95726122 -0.95726122 -0.95726122 -0.65023423 -0.65023423 -0.65023423
 -0.65023423 -0.65023423 -0.65023423 -0.65023423 -0.65023423 -0.65023423
 -0.95726122 -0.65023423]
Class: no, Feature: income_low, Density: [-0.41566086 -0.41566086 -0.41566086 -0.41566086 -1.40257636 -1.40257636
 -1.40257636 -0.41566086 -1.40257636 

In [8]:
import pandas as pd

# Load the lab dataset.
data = pd.read_csv('lab_4.csv')
n_rows = len(data)

# Tally how often each distinct value of the 'age' column occurs.
# Note: 'age' is one of the predictor columns, so the result below is the
# marginal relative frequency of each age bracket rather than a class prior.
class_counts = data['age'].value_counts()

# Turn the tallies into relative frequencies over all rows of the file.
prior_probabilities = class_counts / n_rows

# Show the resulting distribution.
print(prior_probabilities)

age
<=30       0.357143
>40        0.357143
31...40    0.285714
Name: count, dtype: float64


In [10]:
import pandas as pd

# Load the lab dataset.
data = pd.read_csv('lab_4.csv')
n_rows = len(data)

# Tally how often each distinct value of the 'income' column occurs.
# Note: 'income' is one of the predictor columns, so the result below is the
# marginal relative frequency of each income level rather than a class prior.
class_counts = data['income'].value_counts()

# Turn the tallies into relative frequencies over all rows of the file.
prior_probabilities = class_counts / n_rows

# Show the resulting distribution.
print(prior_probabilities)

income
medium    0.428571
high      0.285714
low       0.285714
Name: count, dtype: float64


In [12]:
import pandas as pd

# Load the lab dataset.
data = pd.read_csv('lab_4.csv')
n_rows = len(data)

# Tally how often each distinct value of the 'student' column occurs.
# Note: 'student' is one of the predictor columns, so the result below is the
# marginal relative frequency of each value rather than a class prior.
class_counts = data['student'].value_counts()

# Turn the tallies into relative frequencies over all rows of the file.
prior_probabilities = class_counts / n_rows

# Show the resulting distribution.
print(prior_probabilities)

student
no     0.5
yes    0.5
Name: count, dtype: float64


In [14]:
import pandas as pd

# Load the lab dataset.
data = pd.read_csv('lab_4.csv')
n_rows = len(data)

# Tally how often each distinct value of the 'credit_rating' column occurs.
# Note: 'credit_rating' is one of the predictor columns, so the result below
# is the marginal relative frequency of each rating rather than a class prior.
class_counts = data['credit_rating'].value_counts()

# Turn the tallies into relative frequencies over all rows of the file.
prior_probabilities = class_counts / n_rows

# Show the resulting distribution.
print(prior_probabilities)

credit_rating
fair         0.571429
excellent    0.428571
Name: count, dtype: float64


In [16]:
import pandas as pd

# Load the lab dataset.
data = pd.read_csv('lab_4.csv')
n_rows = len(data)

# Tally how many rows carry each value of the 'buys_computer' column.
class_counts = data['buys_computer'].value_counts()

# Prior probability of each value = its tally divided by the total row count.
prior_probabilities = class_counts / n_rows

# Show the priors.
print(prior_probabilities)

buys_computer
yes    0.642857
no     0.357143
Name: count, dtype: float64


In [18]:
import pandas as pd
from scipy.stats import chi2_contingency

# Load the lab dataset.
data = pd.read_csv('lab_4.csv')

# Cross-tabulate 'age' (rows) against every observed combination of
# 'income', 'student' and 'credit_rating' (columns). The test below therefore
# compares age with the joint distribution of the other three columns.
other_features = [data['income'], data['student'], data['credit_rating']]
contingency_table = pd.crosstab(data['age'], other_features)

# Run the chi-square test of independence on that table.
# NOTE(review): the frequencies printed elsewhere in this notebook look like
# multiples of 1/14, which suggests a very small dataset; chi-square p-values
# are unreliable when expected cell counts are this low. Confirm before
# drawing conclusions from the p-value.
test_result = chi2_contingency(contingency_table)
chi2, p_value, dof, expected = test_result

# Report the p-value only.
print("p-value:", p_value)

p-value: 0.6764100579553458


In [9]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Step 1: Load the tips dataset.
df = pd.read_csv('tips.csv')

# Step 2: Prepare the data.
# The single predictor is the 'sex' column; the label is the 'tip' amount
# bucketed into width-5 intervals over (0, 50] and rendered as strings.
tip_bin_edges = list(range(0, 55, 5))  # 0, 5, 10, ..., 50
X = df['sex']
y = pd.cut(df['tip'], bins=tip_bin_edges).astype(str)

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Turn the 'sex' strings into token-count columns. The vocabulary is
# learned from the training split only and then applied to the test split.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Step 4: Train the Naive Bayes classifier. The sparse count matrices are
# converted to dense arrays before being handed to GaussianNB.
# NOTE(review): Gaussian likelihoods over token-count columns of a
# categorical field look like a questionable modelling choice. Confirm intent.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_vectorized.toarray(), y_train)

# Step 5: Evaluate: fraction of held-out rows whose predicted bucket matches.
# NOTE(review): if most tips fall into one bucket, a high accuracy here may
# only reflect that majority class. Check the label distribution.
y_pred = nb_classifier.predict(X_test_vectorized.toarray())
accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy)

Accuracy: 0.9795918367346939


In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Step 1: Read the lab dataset.
df = pd.read_csv('lab_4.csv')

# Step 2: Pick the predictor ('income') and the label ('student').
X, y = df['income'], df['student']

# Step 3: Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Turn the 'income' strings into token-count columns. The vocabulary
# is learned from the training split only and then applied to the test split.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Step 5: Fit a multinomial Naive Bayes model on the training counts.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

# Step 6: Score the model on the held-out rows and report the accuracy.
# NOTE(review): the printed accuracy elsewhere in this notebook is a multiple
# of 1/3, which suggests a tiny test split. Treat the figure as anecdotal.
accuracy = nb_classifier.score(X_test_vectorized, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.6666666666666666
