<a href="https://colab.research.google.com/github/shivamsinghtomar78/ML-Projects-/blob/main/tuberculosis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import pandas as pd

In [59]:
# Read the raw tuberculosis table from the Colab session storage.
csv_path = '/content/tuberculosis.csv'
df = pd.read_csv(csv_path)

In [60]:
# Print the column summary (dtypes and non-null counts). `info()` writes to
# stdout and returns None, so it is called as a statement rather than placed in
# a tuple, which would render as `(None, ...)` with a plain-text frame.
df.info()

# Leave the preview as the cell's last expression so the notebook renders the
# first five rows with the rich DataFrame display.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 15 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Reporting area                                335 non-null    object 
 1   MMWR Year                                     335 non-null    int64  
 2   MMWR Quarter                                  335 non-null    int64  
 3   Tuberculosis†, Current quarter                290 non-null    float64
 4   Tuberculosis†, Current quarter, flag          45 non-null     object 
 5   Tuberculosis†, Previous 4 quarters Min        335 non-null    int64  
 6   Tuberculosis†, Previous 4 quarters Min, flag  0 non-null      float64
 7   Tuberculosis†, Previous 4 quarters Max        335 non-null    int64  
 8   Tuberculosis†, Previous 4 quarters Max, flag  0 non-null      float64
 9   Tuberculosis†, Cum 2018                       317 non-null    flo

(None,
   Reporting area  MMWR Year  MMWR Quarter  Tuberculosis†, Current quarter  \
 0  UNITED STATES       2018             1                          1216.0   
 1    NEW ENGLAND       2018             1                            57.0   
 2  MID. ATLANTIC       2018             1                           271.0   
 3  NEW YORK CITY       2018             1                           157.0   
 4   E.N. CENTRAL       2018             1                           125.0   
 
   Tuberculosis†, Current quarter, flag  \
 0                                  NaN   
 1                                  NaN   
 2                                  NaN   
 3                                  NaN   
 4                                  NaN   
 
    Tuberculosis†, Previous 4 quarters Min  \
 0                                    1216   
 1                                      57   
 2                                     271   
 3                                     150   
 4                               

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [62]:
# Remove the columns that are not used as model inputs: the five "flag"
# companion columns and the two location columns.
columns_to_drop = [
    "Tuberculosis†, Current quarter, flag",
    "Tuberculosis†, Previous 4 quarters Min, flag",
    "Tuberculosis†, Previous 4 quarters Max, flag",
    "Tuberculosis†, Cum 2018, flag",
    "Tuberculosis†, Cum 2017, flag",
    "Location 1",
    "Location 2",
]
df_cleaned = df.drop(columns=columns_to_drop)


In [63]:
# Keep only the rows where the column later used as the target is present.
required_columns = ["Tuberculosis†, Current quarter"]
df_cleaned = df_cleaned.dropna(axis=0, subset=required_columns)

In [64]:
# Replace the "Reporting area" strings with integer codes from a LabelEncoder.
# The fitted encoder is kept in `encoder`.
encoder = LabelEncoder()
encoder.fit(df_cleaned["Reporting area"])
area_codes = encoder.transform(df_cleaned["Reporting area"])
df_cleaned["Reporting area"] = area_codes

In [65]:
# The current-quarter count is the regression target; every other remaining
# column is a feature.
target_column = "Tuberculosis†, Current quarter"
y = df_cleaned[target_column]
X = df_cleaned.drop(columns=[target_column])


In [66]:
# Fill missing feature values with the per-column mean. The imputer is fitted
# on every row of X and then applied to those same rows.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)
X_imputed = imputer.transform(X)

In [67]:
# Standardize the imputed features with a StandardScaler fitted on the full
# imputed matrix.
scaler = StandardScaler()
scaler.fit(X_imputed)
X_scaled = scaler.transform(X_imputed)

In [68]:
# Split the raw feature rows first (80% train / 20% test, fixed seed). The row
# partition is the same as splitting the scaled matrix, because it depends only
# on the number of rows and the seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-fit the imputer and the scaler on the training rows only, then apply the
# fitted transforms to the held-out rows. Fitting them on all rows before the
# split would let test-set values influence the imputation means and scaling
# statistics (data leakage), making the evaluation optimistic.
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

In [69]:
# Fit a random forest regressor (100 trees, fixed seed) on the training split.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [70]:
# Score the held-out rows with the fitted forest.
y_pred = model.predict(X=X_test)

In [71]:
# Compare predictions with the held-out targets: mean absolute error and R^2.
mae, r2 = (
    mean_absolute_error(y_true=y_test, y_pred=y_pred),
    r2_score(y_true=y_test, y_pred=y_pred),
)

In [72]:
# Display the two metrics as a (MAE, R^2) tuple.
(mae, r2)

(13.65379310344827, 0.971677030841127)

In [73]:
import pickle

In [77]:
# Persist the trained regressor. File name and contents are unchanged: the
# pickle holds only the fitted model object.
model_name ="tuberculosis_model.pkl"
with open(model_name, "wb") as file:
    pickle.dump(model, file)

# The model was trained on label-encoded, imputed and scaled features, so it
# cannot score raw inputs without the fitted transformers. Save them, together
# with the feature column order, in a companion file so the same preprocessing
# can be replayed at prediction time.
preprocessing_name = "tuberculosis_preprocessing.pkl"
preprocessing = {
    "encoder": encoder,
    "imputer": imputer,
    "scaler": scaler,
    "feature_columns": list(X.columns),
}
with open(preprocessing_name, "wb") as file:
    pickle.dump(preprocessing, file)