In [1]:
#### IMPORTING THE LIBRARIES
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#### 1. FOR MULTICLASS PROBLEM USING 20 NEWS GROUPS DATASET

In [2]:
#### LOADING THE DATASET
categories = ['rec.autos', 'comp.graphics', 'sci.space', 'soc.religion.christian', 'talk.religion.misc']

newsgroups = fetch_20newsgroups(subset = 'all', categories = categories) 


In [3]:
#### FETCHING THE FEATURES AND TARGET
X = newsgroups.data
y = newsgroups.target


In [4]:
#### VECTORIZING THE TEXT DATA INTO THE NUMERICAL DATA 
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 5)
X_tfidf = vectorizer.fit_transform(X)
X_tfidf

<4575x5 sparse matrix of type '<class 'numpy.float64'>'
	with 18440 stored elements in Compressed Sparse Row format>

In [5]:
#### CONVERTING TO DATAFRAME
X_df = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
y_df = pd.DataFrame(y, columns=['Class'])
final_df = pd.concat([X_df, y_df], axis=1)
final_df.head()

Unnamed: 0,com,edu,lines,organization,subject,Class
0,0.72665,0.0,0.389754,0.410482,0.389328,2
1,0.0,0.0,0.567321,0.597492,0.566701,4
2,0.0,0.623963,0.443335,0.466913,0.442851,2
3,0.688031,0.69253,0.123013,0.129555,0.122879,1
4,0.904074,0.0,0.24246,0.255354,0.242195,4


In [6]:
#### SEPARATING THE REGRESSORS AND RESPONSE
X = final_df.drop('Class', axis = 1)
y = final_df['Class']

In [7]:
#### USING STATSMODELS.API OLS METHOD
X = sm.add_constant(X)
model = sm.MNLogit(y, X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 1.521667
         Iterations 6
                          MNLogit Regression Results                          
Dep. Variable:                  Class   No. Observations:                 4575
Model:                        MNLogit   Df Residuals:                     4551
Method:                           MLE   Df Model:                           20
Date:                Thu, 20 Feb 2025   Pseudo R-squ.:                 0.04654
Time:                        20:48:33   Log-Likelihood:                -6961.6
converged:                       True   LL-Null:                       -7301.4
Covariance Type:            nonrobust   LLR p-value:                4.585e-131
     Class=1       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.1834      0.427     -0.430      0.668      -1.020       0.653
com              1.3107

##### 1. FOR CLASS 1, THE FEATURE 'COM', HAS THE MAXIMUM LIKELIHOOD FOR A DOCUMENT BELONGING TO CLASS 1 AS IT IS POSITIVE VALUED COEFFICIENT
#####    WHEREAS, THE FEATURE 'LINES' HAS THE LEAST LIKELIHOOD OF BELONGING TO CLASS 1 AS IT HAS NEGATIVE COEFFICIENT.
##### 2. FOR CLASS 2, THE FEATURE 'EDU', HAS THE MAXIMUM LIKELIHOOD OF BELONGING TO CLASS 2 AS IT IS POSITIVE VALUED COEFFICIENT
#####    WHEREAS, THE FEATURE 'LINES' HAS THE LEAST LIKELIHOOD OF BELONGING TO CLASS 2 AS IT HAS NEGATIVE COEFFICIENT.
##### 3. FOR CLASS 3, THE FEATURE 'SUBJECT', HAS THE MAXIMUM LIKELIHOOD FOR THE DOCUMENT BELONGING TO CLASS 2 AS IT IS POSITIVE VALUED COEFFICIENT
#####    WHEREAS, THE FEATURE 'ORGANIZATION' HAS THE LEAST LIKELIHOOD OF BELONGING TO CLASS 2 AS IT HAS NEGATIVE COEFFICIENT.
##### 4. FOR CLASS 4, THE FEATURE 'SUBJECT', HAS THE MAXIMUM LIKELIHOOD FOR THE DOCUMENT BELONGING TO CLASS 2 AS IT IS POSITIVE VALUED COEFFICIENT
#####    WHEREAS, THE FEATURE 'LINES' HAS THE LEAST LIKELIHOOD OF BELONGING TO CLASS 2 AS IT HAS NEGATIVE COEFFICIENT.
##### 5. THEREFORE, WORDS LIKE 'COM'(COMPUTERS) AND 'EDU'(EDUCATION), PLAYS A MORE SIGNIFICANT ROLE FOR PREDICTING TO WHICH CLASS A DOCUMENT 
#####    BELONGS TO AS COMPARED TO WORD LIKE 'LINES'.

In [8]:
#### INLINE PREDICTION
model.predict(X)

Unnamed: 0,0,1,2,3,4
0,0.200657,0.234651,0.192123,0.204474,0.168095
1,0.427566,0.145922,0.220920,0.144532,0.061060
2,0.237187,0.140946,0.229616,0.291012,0.101240
3,0.107535,0.271012,0.193578,0.203017,0.224858
4,0.172168,0.320231,0.186991,0.130327,0.190283
...,...,...,...,...,...
4570,0.349215,0.187923,0.185019,0.216613,0.061231
4571,0.474752,0.155047,0.167656,0.174944,0.027600
4572,0.122905,0.263768,0.178958,0.208755,0.225614
4573,0.211427,0.233741,0.271687,0.164987,0.118158


In [9]:
#### DISPLAYING THE INLINE PREDICTED CLASSES
np.argmax(np.array(model.predict(X)), axis = 1) + 1

array([2, 1, 4, ..., 2, 3, 2], dtype=int64)

In [10]:
#### USING SKLEARN.LINEAR_MODEL METHOD
mdl = LogisticRegression(penalty = None, max_iter = 500)
mdl.fit(X, y)
print("ESTIMATED REGRESSION COEFFICIENTS:", mdl.coef_)
print("ESTIMATED BIAS OR INTERCEPT:", mdl.intercept_)

ESTIMATED REGRESSION COEFFICIENTS: [[ 0.8014117  -1.58305775 -1.37466463  1.25572188 -0.27774292 -2.10790347]
 [ 0.6964505  -0.25483741 -0.78183542 -0.98908951  0.15653859 -1.84890163]
 [ 0.39298009 -0.65994306 -0.30724934 -0.19924419  0.23642629 -0.92124949]
 [-1.28725044  1.27077195  1.77657798  1.99268269 -1.81238986  4.22197241]
 [-0.60359185  1.22706627  0.68717141 -2.06007087  1.6971679   0.65608218]]
ESTIMATED BIAS OR INTERCEPT: [ 0.8014117   0.6964505   0.39298009 -1.28725044 -0.60359185]


In [11]:
#### INLINE PREDICTION
pred = mdl.predict(X)
accuracy = accuracy_score(y, pred)
print(accuracy)

0.32633879781420766


#### 2. FOR BINARY CLASS PROBLEM USING MUSHROOM DATA

In [12]:
#### LOADING THE DATASET
mushroom = fetch_openml(name='mushroom', version=1)


In [13]:
#### CONVERTING THE DATA INTO THE PANDAS DATAFRAME
df = mushroom.frame
df.head()


Unnamed: 0,cap-shape,cap-surface,cap-color,bruises%3F,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e


In [14]:
#### SEPARATING THE REGRESSORS AND RESPONSE
X = df.iloc[:, [0,1,2,4,5]]  ## HERE, WE SELECTED ONLY 5 FEATURES FOR FURTHER PROCESS
y = df['class']


In [15]:
#### LABEL ENCODING OF THE FEATURES OR REGRESSORS AND RESPONSE AS THEY ARE CATEGORICAL DATA
label_encoder = LabelEncoder()
X_encoded = X.apply(label_encoder.fit_transform)
y_encoded = label_encoder.fit_transform(y)


In [16]:
#### TRAIN TEST SPLITTING 
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)


In [17]:
#### STANDARDIZATION OF THE FEATURES
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
#### GETTING THE MACHINE READY WITHOUT ANY REGULARIZATION
model = LogisticRegression(penalty = None)
model

In [19]:
#### ESTIMATING THE PARAMETERS
model.fit(X_train_scaled, y_train)
print("ESTIMATED REGRESSION COEFFICIENTS:", model.coef_)
print("ESTIMATED BIAS OR INTERCEPT:", model.intercept_)

ESTIMATED REGRESSION COEFFICIENTS: [[ 0.0812467   0.39198716 -0.16247848 -0.26203141  0.35332738]]
ESTIMATED BIAS OR INTERCEPT: [-0.09024762]


In [20]:
#### INLINE PREDICTIONS
model.predict(X_train_scaled)

array([0, 1, 0, ..., 1, 1, 0])

In [21]:
#### OUTLINE PREDICTIONS
y_pred = model.predict(X_test_scaled)
y_pred

array([0, 0, 1, ..., 1, 0, 0])

In [22]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print("TEST ACCURACY OF THE MODEL:", accuracy)
print("CONFUSION_MATRIX:\n",conf_matrix)

TEST ACCURACY OF THE MODEL: 0.5378461538461539
CONFUSION_MATRIX:
 [[474 369]
 [382 400]]


In [23]:
#### USING STATSMODEL.API (OLS) METHOD OF ESTIMATION
X = sm.add_constant(X_train_scaled)
mdl = sm.Logit(y_train, X).fit()
mdl.summary()

Optimization terminated successfully.
         Current function value: 0.657733
         Iterations 6


0,1,2,3
Dep. Variable:,y,No. Observations:,6499.0
Model:,Logit,Df Residuals:,6493.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 20 Feb 2025",Pseudo R-squ.:,0.05023
Time:,20:48:34,Log-Likelihood:,-4274.6
converged:,True,LL-Null:,-4500.7
Covariance Type:,nonrobust,LLR p-value:,1.734e-95

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0903,0.026,-3.464,0.001,-0.141,-0.039
x1,0.0814,0.026,3.139,0.002,0.031,0.132
x2,0.3919,0.026,15.125,0.000,0.341,0.443
x3,-0.1625,0.028,-5.844,0.000,-0.217,-0.108
x4,-0.2620,0.028,-9.468,0.000,-0.316,-0.208
x5,0.3534,0.042,8.399,0.000,0.271,0.436


##### 1. THEREFORE, WE CAN STATE THAT THE ESTIMATED COEFFICIENTS AGREE WITH THE RESULTS OBTAINED FROM THE 
#####    SKLEARN.LINEAR_MODEL.
##### 2. FROM THE ABOVE STATSMODEL.API METHOD WE CAN STATE THAT ALL THESE 5 FEATURES ARE STATISTICALLY 
##### SIGNIFICANT AS THEIR P-VALUES ARE LESS THAN THE THRESHOLD 0.05.
