In [None]:
pip install cassandra-driver# 


1. Start Cassandra:
   After installing Cassandra using Homebrew as described in the previous steps, start Cassandra by running the following command in the Terminal:
   ```
   cassandra -f
   ```

2. Connect to Cassandra:
   Open a new Terminal window and run the following command to connect to the Cassandra database:
   ```
   cqlsh
   ```

3. Create a keyspace:
   In Cassandra, data is organized into keyspaces, which are similar to databases in traditional relational databases. Let's create a keyspace named "my_keyspace":
   ```
   CREATE KEYSPACE my_keyspace WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
   ```

4. Use the keyspace:
   After creating the keyspace, you need to use it before creating or accessing any tables within it:
   ```
   USE my_keyspace;
   ```

5. Create a table:
   Now, let's create a simple table named "users" with columns for user_id, name, and age:
   ```
   CREATE TABLE users (
       user_id UUID PRIMARY KEY,
       name text,
       age int
   );
   ```

6. Insert data into the table:
   You can insert data into the "users" table as follows:
   ```
   INSERT INTO users (user_id, name, age) VALUES (uuid(), 'John Doe', 30);
   INSERT INTO users (user_id, name, age) VALUES (uuid(), 'Jane Smith', 28);
   ```

7. Query data from the table:
   You can query data from the "users" table using CQL SELECT statements:
   ```
   SELECT * FROM users;
   ```
   This will retrieve all the rows from the "users" table.

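8. Update data in the table:
   To update data in the "users" table, use the UPDATE statement. Cassandra requires the WHERE clause of an UPDATE to identify the row by its primary key (here `user_id`); filtering on a regular column such as `name` is rejected. Look up the row's `user_id` with the SELECT above and substitute it for the placeholder:
   ```
   UPDATE users SET age = 31 WHERE user_id = <user_id of John Doe>;
   ```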

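9. Delete data from the table:
   To delete data from the "users" table, use the DELETE statement. As with any write in Cassandra, the WHERE clause must identify the row by its primary key (`user_id`), not by a regular column such as `name`. Look up the row's `user_id` with a SELECT and substitute it for the placeholder:
   ```
   DELETE FROM users WHERE user_id = <user_id of Jane Smith>;
   ```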

In [11]:
# Shell out to Cassandra's nodetool CLI to list the cluster's nodes and their
# state, as a quick check that the node is reachable before connecting.
!nodetool status

Datacenter: datacenter1
Status=Up/Down
|/ State=Normal/Leaving/Joining/Moving
--  Address    Load       Tokens  Owns (effective)  Host ID                               Rack 
UN  127.0.0.1  93,12 KiB  16      100,0%            888b83ac-1409-48f7-93e2-a93cea981513  rack1



In [9]:
from cassandra.cluster import Cluster
import random
import uuid


# Connect to the local Cassandra node and open a session bound to `my_keyspace`.
# The keyspace must already exist (it is created in the cqlsh walkthrough above).
cluster = Cluster(['127.0.0.1'])
session = cluster.connect('my_keyspace')

# Create the table that stores the synthetic dataset; IF NOT EXISTS keeps this
# statement safe to re-run.
session.execute(
    """
    CREATE TABLE IF NOT EXISTS user_data (
        user_id UUID PRIMARY KEY,
        age INT,
        income FLOAT,
        label INT
    )
    """
)

# Prepare the INSERT once so the statement is parsed a single time; each
# iteration below then only sends the bound values. Prepared statements use
# `?` placeholders rather than `%s`.
insert_user = session.prepare(
    """
    INSERT INTO user_data (user_id, age, income, label)
    VALUES (?, ?, ?, ?)
    """
)

# Generate 100 synthetic rows and insert them. Every run appends new rows,
# because each row gets a fresh random UUID as its primary key.
for i in range(100):
    user_id = uuid.uuid4()
    age = random.randint(20, 60)
    income = random.uniform(20000, 100000)
    label = random.randint(0, 1)  # drawn at random, independent of age/income

    session.execute(insert_user, (user_id, age, income, label))


In [15]:
# Append another 100 synthetic rows to `user_data` using the session opened
# earlier in the notebook.

# Prepare the INSERT once so the statement is parsed a single time; each
# iteration below then only sends the bound values. Prepared statements use
# `?` placeholders rather than `%s`.
insert_user = session.prepare(
    """
    INSERT INTO user_data (user_id, age, income, label)
    VALUES (?, ?, ?, ?)
    """
)

for i in range(100):
    user_id = uuid.uuid4()  # fresh primary key, so existing rows are never overwritten
    age = random.randint(20, 60)
    income = random.uniform(20000, 100000)
    label = random.randint(0, 1)  # drawn at random, independent of age/income

    session.execute(insert_user, (user_id, age, income, label))

In [None]:
# Accessing data from the Cassandra table

In [17]:
from cassandra.cluster import Cluster
import pandas as pd

# Reuse the `session` opened earlier in the notebook instead of calling
# `cluster.connect()` again: each connect() returns a new Session object, and
# rebinding the name would leave the previous session open.

# Keyspace and table to read from
keyspace_name = 'my_keyspace'
table_name = 'user_data'
session.set_keyspace(keyspace_name)

# Fetch every row of the table. The table name is a fixed constant defined
# above, not user input, so formatting it into the query text is safe here.
query = f"SELECT * FROM {table_name};"
result_set = session.execute(query)

# Convert the query results to a pandas DataFrame (one row per returned record)
df = pd.DataFrame(result_set)

# Display the table
df


Unnamed: 0,user_id,age,income,label
0,d940fd3c-762e-4184-bd5a-4d160ae9c7e4,24,94255.703125,1
1,654352eb-9cf9-4742-ad51-5fb20b1553d5,38,67063.031250,0
2,4a25e398-3263-4bcc-ac7c-bf2d318e77ac,31,27901.945312,0
3,7e7c57de-488e-44c8-8e90-2a0e279f0c0d,27,54987.843750,1
4,4034e3a1-6d28-41a4-8dca-654b992b16fd,52,34821.851562,0
...,...,...,...,...
195,7d67c121-7147-42db-9d1b-704538084cd9,50,70910.406250,0
196,172c0eaf-000d-4b70-bdac-7efd82160b3e,34,80067.406250,0
197,ec5efea5-f0d0-442c-be29-22c85ed60a5e,26,20688.388672,1
198,7d851bb6-de5c-4430-a394-444d8e00dcc7,52,34570.730469,1


In [21]:
from cassandra.cluster import Cluster
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# The `session` created in an earlier cell is reused here; no new connection
# is opened in this cell.

# Pull only the modelling columns out of Cassandra and materialise the rows
query = "SELECT age, income, label FROM user_data"
rows = session.execute(query)
data = list(rows)

# Tabulate the rows with explicit column names
df = pd.DataFrame(data, columns=['age', 'income', 'label'])

# Features (age, income) versus the target (label)
X = df[['age', 'income']]
y = df['label']

# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable for a given set of rows
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=12,
)

# Fit a logistic regression classifier on the training portion
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Report the fraction of held-out rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.55


In [19]:
# Display the DataFrame currently bound to `df` (pandas truncates long frames
# to their first and last rows in the rendered output).
df 

Unnamed: 0,age,income,label
0,24,94255.703125,1
1,38,67063.031250,0
2,31,27901.945312,0
3,27,54987.843750,1
4,52,34821.851562,0
...,...,...,...
195,50,70910.406250,0
196,34,80067.406250,0
197,26,20688.388672,1
198,52,34570.730469,1
