In [1]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given), so warnings raised by later cells will not be shown.
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset from a CSV file; the path is relative, so it is resolved
# against the notebook's working directory.
df = pd.read_csv('AIDS_Classification.csv')

In [3]:
# List the column labels of the loaded frame
print(df.columns)

Index(['time', 'trt', 'age', 'wtkg', 'hemo', 'homo', 'drugs', 'karnof',
       'oprior', 'z30', 'preanti', 'race', 'gender', 'str2', 'strat',
       'symptom', 'treat', 'offtrt', 'cd40', 'cd420', 'cd80', 'cd820',
       'infected'],
      dtype='object')


In [4]:
# Show the frame's dimensions as (rows, columns)
print(df.shape)

(50000, 23)


In [5]:
# Preview the first five rows
print(df.head())

   time  trt  age       wtkg  hemo  homo  drugs  karnof  oprior  z30  ...  \
0  1073    1   37   79.46339     0     1      0     100       0    1  ...   
1   324    0   33   73.02314     0     1      0      90       0    1  ...   
2   495    1   43   69.47793     0     1      0     100       0    1  ...   
3  1201    3   42   89.15934     0     1      0     100       1    1  ...   
4   934    0   37  137.46581     0     1      0     100       0    0  ...   

   str2  strat  symptom  treat  offtrt  cd40  cd420  cd80  cd820  infected  
0     1      2        0      1       0   322    469   882    754         1  
1     1      3        1      1       1   168    575  1035   1525         1  
2     1      1        0      0       0   377    333  1147   1088         1  
3     1      3        0      0       0   238    324   775   1019         1  
4     0      3        0      0       1   500    443  1601    849         0  

[5 rows x 23 columns]


In [6]:
# Preview the last five rows
print(df.tail())

       time  trt  age      wtkg  hemo  homo  drugs  karnof  oprior  z30  ...  \
49995   953    3   46  61.28204     0     0      0      90       0    1  ...   
49996  1036    0   42  73.36768     0     1      0     100       0    1  ...   
49997  1157    0   40  78.75824     0     1      0     100       0    1  ...   
49998   596    0   31  52.20371     0     0      0     100       0    1  ...   
49999   612    2   41  77.12100     0     1      0      90       0    1  ...   

       str2  strat  symptom  treat  offtrt  cd40  cd420  cd80  cd820  infected  
49995     1      3        0      1       1   234    402   481   1014         0  
49996     1      3        0      0       1   369    575   514    657         0  
49997     1      1        0      1       0   308    663  1581    863         0  
49998     1      1        0      1       1   349    440   470    865         1  
49999     1      3        0      1       0   428    396  1002    696         0  

[5 rows x 23 columns]


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
print(df.describe())

               time           trt           age          wtkg          hemo  \
count  50000.000000  50000.000000  50000.000000  50000.000000  50000.000000   
mean     877.369780      1.384800     34.164020     75.861991      0.033480   
std      307.288688      1.233272      7.091152     12.028730      0.179888   
min       66.000000      0.000000     12.000000     42.361620      0.000000   
25%      542.000000      0.000000     29.000000     68.253682      0.000000   
50%     1045.000000      1.000000     34.000000     74.054115      0.000000   
75%     1136.000000      3.000000     39.000000     81.142185      0.000000   
max     1231.000000      3.000000     68.000000    149.830870      1.000000   

               homo         drugs        karnof        oprior           z30  \
count  50000.000000  50000.000000  50000.000000  50000.000000  50000.000000   
mean       0.653540      0.132220     96.831560      0.042300      0.640880   
std        0.475847      0.338733      5.091788    

In [8]:
# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   time      50000 non-null  int64  
 1   trt       50000 non-null  int64  
 2   age       50000 non-null  int64  
 3   wtkg      50000 non-null  float64
 4   hemo      50000 non-null  int64  
 5   homo      50000 non-null  int64  
 6   drugs     50000 non-null  int64  
 7   karnof    50000 non-null  int64  
 8   oprior    50000 non-null  int64  
 9   z30       50000 non-null  int64  
 10  preanti   50000 non-null  int64  
 11  race      50000 non-null  int64  
 12  gender    50000 non-null  int64  
 13  str2      50000 non-null  int64  
 14  strat     50000 non-null  int64  
 15  symptom   50000 non-null  int64  
 16  treat     50000 non-null  int64  
 17  offtrt    50000 non-null  int64  
 18  cd40      50000 non-null  int64  
 19  cd420     50000 non-null  int64  
 20  cd80      50000 non-null  in

In [9]:
# Count rows that are exact repeats of an earlier row
duplicate_count = df.duplicated().sum()

# Print the count of duplicate rows
print("Duplicate Rows:", duplicate_count)

# Keep only the first occurrence of each row. Rebinding `df` (rather than
# using inplace=True) leaves the previously displayed frame object untouched.
df = df.drop_duplicates()

Duplicate Rows: 0


In [10]:
# Report how many missing values each column contains
print("\nMissing Values:")
print(df.isnull().sum())

# Drop every row that has at least one missing value. Rebinding `df` (rather
# than using inplace=True) avoids mutating the frame shown by earlier cells.
df = df.dropna()


Missing Values:
time        0
trt         0
age         0
wtkg        0
hemo        0
homo        0
drugs       0
karnof      0
oprior      0
z30         0
preanti     0
race        0
gender      0
str2        0
strat       0
symptom     0
treat       0
offtrt      0
cd40        0
cd420       0
cd80        0
cd820       0
infected    0
dtype: int64


In [11]:
# Save the cleaned dataset to a new CSV file named after the source dataset
# (the previous name referred to an unrelated income/education dataset).
# index=False keeps the row index out of the file.
df.to_csv('AIDS_Classification_cleaned.csv', index=False)

In [13]:
# Summary report: row counts before and after cleaning
print("\nSummary Report:")

# The raw file is read again so the "before" count does not depend on `df`,
# which has been through duplicate/missing-value removal by this point.
original_rows = len(pd.read_csv('AIDS_Classification.csv'))
cleaned_rows = len(df)

print("Original dataset size:", original_rows)
print("Cleaned dataset size:", cleaned_rows)


Summary Report:
Original dataset size: 50000
Cleaned dataset size: 50000


In [14]:
# Integer-encode every object-dtype column, keeping the fitted encoder for
# each column so the mapping can be looked up or inverted later.
object_columns = df.select_dtypes(include=['object']).columns
encoders = {}
for col in object_columns:
    column_encoder = LabelEncoder()
    df[col] = column_encoder.fit_transform(df[col])
    encoders[col] = column_encoder

In [15]:
# Separate the target column from the predictors
y = df['infected']  # target variable
X = df.drop(columns=['infected'])  # every remaining column is a feature

In [17]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Choose the columns for each preprocessing branch purely by dtype.
# NOTE(review): in the recorded df.info() output the visible columns are all
# int64/float64, so `categorical_features` is presumably empty and integer-coded
# fields (e.g. 'trt', whose values range 0-3) are scaled as if continuous —
# confirm that this is intended.
numeric_dtypes = ['int64', 'float64']
numeric_features = X.select_dtypes(include=numeric_dtypes).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [20]:
# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories not seen during fitting ignored instead of raising.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [21]:
# Route each column group to its transformer: (name, transformer, columns)
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Train Logistic Regression model

In [23]:
# Model 1: logistic regression with default hyperparameters and a fixed seed
logreg_model = LogisticRegression(random_state=42)

In [24]:
# Chain preprocessing and the classifier so both are fitted in one call
logreg_steps = [
    ('preprocessor', preprocessor),
    ('model', logreg_model),
]
logreg_pipeline = Pipeline(steps=logreg_steps)

In [25]:
# Fit the preprocessing step and the logistic-regression model on the training split
logreg_pipeline.fit(X_train, y_train)

In [26]:
# Predict labels for the held-out test split
y_pred_logreg = logreg_pipeline.predict(X_test)

In [27]:
# Evaluate accuracy: share of test rows whose predicted label equals the true label
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Test Accuracy:", accuracy_logreg)

Logistic Regression Test Accuracy: 0.7043


# Train K-Nearest Neighbors (KNN) model

In [28]:
# Model 2: K-Nearest Neighbors, constructed with no arguments (library defaults)
knn_model = KNeighborsClassifier()

In [29]:
# Chain preprocessing and the KNN classifier. This passes the same
# `preprocessor` object that the logistic-regression pipeline was given.
knn_steps = [
    ('preprocessor', preprocessor),
    ('model', knn_model),
]
knn_pipeline = Pipeline(steps=knn_steps)

In [30]:
# Fit the preprocessing step and the KNN model on the training split
knn_pipeline.fit(X_train, y_train)

In [31]:
# Predict labels for the held-out test split
y_pred_knn = knn_pipeline.predict(X_test)

In [33]:
# Accuracy of the KNN predictions on the test split
accuracy_knn = accuracy_score(y_test, y_pred_knn)

In [34]:
# Report the KNN test accuracy
print("K-Nearest Neighbors Test Accuracy:", accuracy_knn)

K-Nearest Neighbors Test Accuracy: 0.6532


In [35]:
# Score both pipelines with the same 10-fold, accuracy-based cross-validation
# over the full feature matrix and target.
cv_settings = {'cv': 10, 'scoring': 'accuracy'}
logreg_scores = cross_val_score(logreg_pipeline, X, y, **cv_settings)
knn_scores = cross_val_score(knn_pipeline, X, y, **cv_settings)

# Model Comparison

In [36]:
# Side-by-side recap of the accuracies measured on the single train/test split
print("Model Comparison for Accuracy")
print(f"Logistic Regression Accuracy: {accuracy_logreg}")
print(f"K-Nearest Neighbors (KNN) Accuracy: {accuracy_knn}")

Model Comparison for Accuracy
Logistic Regression Accuracy: 0.7043
K-Nearest Neighbors (KNN) Accuracy: 0.6532


From these results, it's evident that the Logistic Regression model outperforms the K-Nearest Neighbors (KNN) model in terms of accuracy for this particular classification task.
The Logistic Regression model achieves an accuracy of around 70.43%, while the KNN model achieves an accuracy of approximately 65.32%.

In [38]:
# Mean accuracy across the 10 cross-validation folds for each model
print("Model Comparison (10-Fold Cross-Validation):")
print(f"Logistic Regression Cross-Validation Mean Accuracy {logreg_scores.mean()}")
print(f"K-Nearest Neighbors (KNN) Cross-Validation Mean Accuracy: {knn_scores.mean()}")

Model Comparison (10-Fold Cross-Validation):
Logistic Regression Cross-Validation Mean Accuracy 0.7070000000000001
K-Nearest Neighbors (KNN) Cross-Validation Mean Accuracy: 0.6607800000000001


In [39]:
# Two hand-written patient records, one dict per patient. They carry every
# feature column but no 'infected' value, since that is what gets predicted.
sample_rows = [
    {
        'time': 1000, 'trt': 2, 'age': 40, 'wtkg': 70,
        'hemo': 0, 'homo': 1, 'drugs': 0, 'karnof': 90,
        'oprior': 0, 'z30': 1, 'preanti': 30, 'race': 0,
        'gender': 1, 'str2': 1, 'strat': 3, 'symptom': 0,
        'treat': 1, 'offtrt': 0,
        'cd40': 400, 'cd420': 500, 'cd80': 900, 'cd820': 700,
    },
    {
        'time': 800, 'trt': 0, 'age': 35, 'wtkg': 65,
        'hemo': 1, 'homo': 0, 'drugs': 1, 'karnof': 95,
        'oprior': 1, 'z30': 0, 'preanti': 60, 'race': 1,
        'gender': 0, 'str2': 0, 'strat': 1, 'symptom': 1,
        'treat': 0, 'offtrt': 1,
        'cd40': 300, 'cd420': 400, 'cd80': 700, 'cd820': 600,
    },
]

# Transpose the rows into a column -> [patient 1, patient 2] mapping,
# preserving the column order of the first row.
sample_records = {
    column: [row[column] for row in sample_rows]
    for column in sample_rows[0]
}

df_sample = pd.DataFrame(sample_records)

In [42]:
# Run both pipelines on the two sample records
y_pred_logreg_sample = logreg_pipeline.predict(df_sample)
y_pred_knn_sample = knn_pipeline.predict(df_sample)

In [43]:
# Show each model's predicted label for the two sample rows, in row order
print("Logistics Regression Predictions (Sample):", y_pred_logreg_sample)
print("Knn Predictions (Sample):", y_pred_knn_sample)

Logistics Regression Predictions (Sample): [0 0]
Knn Predictions (Sample): [0 1]


This means that the Logistic Regression model predicted both samples as not infected (0), while the KNN model predicted the first sample as not infected (0) and the second sample as infected (1).