In [1]:
import pandas as pd

# Labels for the headerless data file: a sample id, nine measurement
# columns, and the class label.
column_names = [
    "id",
    "clump",
    "cellSize",
    "cellShape",
    "ad",
    "sCellSize",
    "nuclei",
    "chromatin",
    "nNuc",
    "mitosis",
    "class",
]

# The file has no header row, so the labels above are supplied explicitly.
data = pd.read_csv("bd.data", names=column_names, header=None)
data.head()

Unnamed: 0,id,clump,cellSize,cellShape,ad,sCellSize,nuclei,chromatin,nNuc,mitosis,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [2]:
# Look at the dtype pandas inferred for every column; an 'object' entry
# means that column contains values that did not parse as numbers.
data_types = data.dtypes
print(data_types)

id            int64
clump         int64
cellSize      int64
cellShape     int64
ad            int64
sCellSize     int64
nuclei       object
chromatin     int64
nNuc          int64
mitosis       int64
class         int64
dtype: object


In [3]:
# 'nuclei' was loaded as object dtype, so it holds at least one value that
# is not a number. Find those rows before deciding how to handle them.

# A value counts as an integer here when its string form is digits only;
# anything else (e.g. a "?" placeholder) is selected by the negated mask.
non_integer_rows = data[~data['nuclei'].astype(str).str.isdigit()]

# Show the offending rows; the message names the column actually inspected
print("Rows with non-integer values in 'nuclei':")
print(non_integer_rows)

Rows with non-integer values in 'column_name':
          id  clump  cellSize  cellShape  ad  sCellSize nuclei  chromatin  \
23   1057013      8         4          5   1          2      ?          7   
40   1096800      6         6          6   9          6      ?          7   
139  1183246      1         1          1   1          1      ?          2   
145  1184840      1         1          3   1          2      ?          2   
158  1193683      1         1          2   1          3      ?          1   
164  1197510      5         1          1   1          2      ?          3   
235  1241232      3         1          4   1          2      ?          3   
249   169356      3         1          1   1          2      ?          3   
275   432809      3         1          3   1          2      ?          2   
292   563649      8         8          8   1          2      ?          6   
294   606140      1         1          1   1          2      ?          2   
297    61634      5         4

In [4]:
# Remove rows whose 'nuclei' value is not an integer, then make the column
# numeric so later steps do not depend on implicit string-to-number coercion.

# Rows where the string form of 'nuclei' is not digits only (e.g. "?")
non_integer_rows = data[~data['nuclei'].astype(str).str.isdigit()]

# Drop those rows, renumber the index from 0, and convert the remaining
# digit strings to int64. Built as one chained expression (no inplace=True)
# so the cell can be re-run safely.
data = (
    data.drop(index=non_integer_rows.index)
    .reset_index(drop=True)
    .assign(nuclei=lambda frame: frame['nuclei'].astype('int64'))
)

# Print the cleaned DataFrame
print("Cleaned DataFrame:")
print(data)

Cleaned DataFrame:
          id  clump  cellSize  cellShape  ad  sCellSize nuclei  chromatin  \
0    1000025      5         1          1   1          2      1          3   
1    1002945      5         4          4   5          7     10          3   
2    1015425      3         1          1   1          2      2          3   
3    1016277      6         8          8   1          3      4          3   
4    1017023      4         1          1   3          2      1          3   
..       ...    ...       ...        ...  ..        ...    ...        ...   
678   776715      3         1          1   1          3      2          1   
679   841769      2         1          1   1          2      1          1   
680   888820      5        10         10   3          7      3          8   
681   897471      4         8          6   4          3      4         10   
682   897471      4         8          8   5          4      5         10   

     nNuc  mitosis  class  
0       1        1      2  


In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix


# Features are every column except the sample identifier and the label;
# the label to predict is 'class'.
X = data.drop(columns=['id', 'class'])
y = data['class']

# Hold out 25% of the rows for testing. The fixed random_state makes the
# split reproducible; it only needs to be performed once.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit a Gaussian Naive Bayes classifier on the training portion
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = nb_model.predict(X_test)

# Fraction of held-out rows classified correctly
accuracy = accuracy_score(y_test, y_pred)

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, y_pred)

print("Naïve Bayes Model:")
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)


Naïve Bayes Model:
Accuracy: 0.9649122807017544
Confusion Matrix:
 [[100   3]
 [  3  65]]


In [7]:
import pickle

# Persist the fitted classifier to disk in binary pickle format so it can
# be reloaded later without retraining.
with open("nb_model.pkl", mode="wb") as model_file:
    pickle.dump(nb_model, model_file)