In [1]:
# import required libraries
!pip install pymongo
import pandas as pd
import json



In [2]:
# import pymongo which creates the connection between python and mongoDB
import pymongo

# 1. Inserting CSV file into the database

In [3]:
# creating a client for pymongo
client = pymongo.MongoClient("mongodb://localhost:27017")

In [4]:
# importing our CSV file into a pandas dataframe
df = pd.read_csv("C:\data\db\colon.csv")

In [5]:
# first 5 rows
df.head()

Unnamed: 0,id,study,rx,sex,age,obstruct,perfor,adhere,nodes,status,differ,extent,surg,node4,time
0,1,1,3,1,43,0,0,0,5.0,1,2.0,3,0,1,1521
1,2,1,3,1,63,0,0,0,1.0,0,2.0,3,0,0,3087
2,3,1,1,0,71,0,0,1,7.0,1,2.0,2,0,1,963
3,4,1,3,0,66,1,0,0,6.0,1,2.0,3,1,1,293
4,5,1,1,1,69,0,0,0,22.0,1,2.0,3,1,1,659


In [23]:
# shape of the dataframe
df.shape

(929, 15)

In [24]:
df.isna().sum()

id           0
study        0
rx           0
sex          0
age          0
obstruct     0
perfor       0
adhere       0
nodes       18
status       0
differ      23
extent       0
surg         0
node4        0
time         0
dtype: int64

In [27]:
df=df.dropna()

In [28]:
df.shape

(888, 15)

In [7]:
# converting the dataframe into dictionary (JSON like structure)
data = df.to_dict(orient="records")

In [8]:
# first two rows of the dictionary
data[0:2]

[{'id': 1,
  'study': 1,
  'rx': 3,
  'sex': 1,
  'age': 43,
  'obstruct': 0,
  'perfor': 0,
  'adhere': 0,
  'nodes': 5.0,
  'status': 1,
  'differ': 2.0,
  'extent': 3,
  'surg': 0,
  'node4': 1,
  'time': 1521},
 {'id': 2,
  'study': 1,
  'rx': 3,
  'sex': 1,
  'age': 63,
  'obstruct': 0,
  'perfor': 0,
  'adhere': 0,
  'nodes': 1.0,
  'status': 0,
  'differ': 2.0,
  'extent': 3,
  'surg': 0,
  'node4': 0,
  'time': 3087}]

In [9]:
# creating a new database "db"
db = client["colon2"]

In [10]:
# printing details of the database
print(db)

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'colon2')


In [11]:
# inserting the dictionary into collection "colonCancer" in the database "db"
db.colonCancer.insert_many(data)

<pymongo.results.InsertManyResult at 0x222d1a2ac80>

# 2. Reading documents from database

In [12]:
# creating collection object to read "colonCancer" collection from database "db"
myCollection = db.get_collection("colonCancer")

In [13]:
# details of the collection object
print(myCollection)

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'colon2'), 'colonCancer')


In [14]:
# reading all documents at a time
# it returns a iterable cursor
allRecords = myCollection.find()

In [15]:
# converting the cursor into a list
# list is easy to operate
listCursor = list(allRecords)

In [16]:
# converting the list into pandas dataframe
df2 = pd.DataFrame(listCursor)

In [17]:
# first five rows of the dataframe
df2.head()

Unnamed: 0,_id,id,study,rx,sex,age,obstruct,perfor,adhere,nodes,status,differ,extent,surg,node4,time
0,628f8e610e3eb4af7502e793,1,1,3,1,43,0,0,0,5.0,1,2.0,3,0,1,1521
1,628f8e610e3eb4af7502e794,2,1,3,1,63,0,0,0,1.0,0,2.0,3,0,0,3087
2,628f8e610e3eb4af7502e795,3,1,1,0,71,0,0,1,7.0,1,2.0,2,0,1,963
3,628f8e610e3eb4af7502e796,4,1,3,0,66,1,0,0,6.0,1,2.0,3,1,1,293
4,628f8e610e3eb4af7502e797,5,1,1,1,69,0,0,0,22.0,1,2.0,3,1,1,659


### ***Organize Our Data***

In [18]:
df2=df2.drop(columns=['_id', 'id', 'study'])
# drop rows containing null values
df2=df2.dropna()
df2.head()

Unnamed: 0,rx,sex,age,obstruct,perfor,adhere,nodes,status,differ,extent,surg,node4,time
0,3,1,43,0,0,0,5.0,1,2.0,3,0,1,1521
1,3,1,63,0,0,0,1.0,0,2.0,3,0,0,3087
2,1,0,71,0,0,1,7.0,1,2.0,2,0,1,963
3,3,0,66,1,0,0,6.0,1,2.0,3,1,1,293
4,1,1,69,0,0,0,22.0,1,2.0,3,1,1,659


### ***import Cox regression library:***

In [19]:
from lifelines import CoxPHFitter

### ***Create CPH object:***

In [20]:
cph=CoxPHFitter()

### ***Fit data into CPH object:***

In [21]:
# fitting all columns
cph.fit(df2, "time", event_col="status")
cph.print_summary(columns=["coef","exp(coef)","exp(coef) lower 95%","exp(coef) upper 95%", "z", "p"])

  columns = summary_df.columns & self.columns
  columns = summary_df.columns & self.columns


0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'status'
baseline estimation,breslow
number of observations,2664
number of events observed,1290
partial log-likelihood,-9513.21
time fit was run,2022-05-26 15:08:03 UTC

Unnamed: 0,coef,exp(coef),exp(coef) lower 95%,exp(coef) upper 95%,z,p
rx,-0.17,0.84,0.79,0.9,-5.07,<0.005
sex,0.01,1.01,0.91,1.13,0.2,0.84
age,0.01,1.01,1.0,1.01,3.1,<0.005
obstruct,0.26,1.3,1.13,1.49,3.76,<0.005
perfor,0.03,1.03,0.76,1.4,0.18,0.85
adhere,0.17,1.19,1.02,1.38,2.28,0.02
nodes,0.05,1.05,1.03,1.06,5.12,<0.005
differ,0.12,1.13,1.01,1.27,2.14,0.03
extent,0.45,1.56,1.37,1.79,6.55,<0.005
surg,0.24,1.27,1.12,1.43,3.86,<0.005

0,1
Concordance,0.67
Partial AIC,19048.42
log-likelihood ratio test,413.89 on 11 df
-log2(p) of ll-ratio test,269.62


In [22]:
df2.shape

(2664, 13)