<a href="https://colab.research.google.com/github/rahul21chavan/Finetuning_Model/blob/main/Gpt2_Finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
pip install torch transformers




In [10]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader

# Define a simple dataset for training
class SimpleDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        tokenizer.pad_token = tokenizer.eos_token  # Set padding token to EOS
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length)

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

In [11]:
# Load pretrained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [13]:
!pip install pandas




In [14]:
import pandas as pd

In [18]:
qa_data = [
    ("proc means data=mydata; run;", "mydata.describe().show()"),
    ("data newdata; set olddata; if age>30; run;", "newdata = olddata.filter(olddata['age'] > 30)"),
    ("proc freq data=mydata; tables gender; run;", "mydata.groupBy('gender').count().show()"),
    ("proc sort data=mydata out=sorted_data; by age; run;", "sorted_data = mydata.orderBy('age')"),
    ("proc print data=mydata; run;", "mydata.show()"),
    ("data mydata; infile 'mydata.csv' dsd; input id name age; run;", "mydata = spark.read.csv('mydata.csv', header=True, inferSchema=True)"),
    ("proc import datafile='mydata.csv' out=mydata dbms=csv replace; run;", "mydata = spark.read.csv('mydata.csv', header=True, inferSchema=True)"),
    ("proc export data=mydata outfile='mydata.csv' dbms=csv replace; run;", "mydata.write.csv('mydata.csv', header=True, mode='overwrite')"),
    ("proc sql; create table newdata as select * from olddata where age>30; quit;", "newdata = spark.sql('SELECT * FROM olddata WHERE age > 30')"),
    ("proc transpose data=mydata out=transposed_data; var age; by id; run;", "transposed_data = mydata.groupBy('id').pivot('age').agg({'age': 'first'})"),
    ("proc univariate data=mydata; var age; run;", "mydata.select('age').summary().show()"),
    ("proc corr data=mydata; var age height; run;", "mydata.corr('age', 'height')"),
    ("proc reg data=mydata; model y=x; run;", "from pyspark.ml.regression import LinearRegression; lr = LinearRegression(featuresCol='x', labelCol='y'); model = lr.fit(mydata)"),
    ("proc logistic data=mydata; model y=x; run;", "from pyspark.ml.classification import LogisticRegression; lr = LogisticRegression(featuresCol='x', labelCol='y'); model = lr.fit(mydata)"),
    ("proc gplot data=mydata; plot y*x; run;", "import matplotlib.pyplot as plt; plt.plot(mydata.select('x').collect(), mydata.select('y').collect())"),
    ("data mydata; length name $20; input id name age; run;", "from pyspark.sql.types import StructType, StructField, IntegerType, StringType; schema = StructType([StructField('id', IntegerType()), StructField('name', StringType()), StructField('age', IntegerType())]); mydata = spark.createDataFrame([], schema)"),
    ("proc format; value agefmt 1-10='Child' 11-18='Teenager' 19-high='Adult'; run;", "from pyspark.sql.functions import when; mydata = mydata.withColumn('age_group', when(mydata['age'].between(1, 10), 'Child').when(mydata['age'].between(11, 18), 'Teenager').otherwise('Adult'))"),
    ("proc summary data=mydata nway; class gender; var age; output out=summary_data mean=mean_age; run;", "summary_data = mydata.groupBy('gender').agg({'age': 'mean'})"),
    ("proc means data=mydata noprint; var age; output out=stats mean=mean_age; run;", "stats = mydata.agg({'age': 'mean'})"),
    ("data mydata; set olddata; rename oldname=newname; run;", "mydata = olddata.withColumnRenamed('oldname', 'newname')"),
    ("data mydata; set olddata (keep=id name); run;", "mydata = olddata.select('id', 'name')"),
    ("data mydata; set olddata (drop=age); run;", "mydata = olddata.drop('age')"),
    ("proc append base=basedata data=newdata; run;", "basedata = basedata.union(newdata)"),
    ("proc merge base=basedata data=newdata; by id; run;", "basedata = basedata.join(newdata, 'id')"),
    ("data mydata; merge basedata newdata; by id; run;", "mydata = basedata.join(newdata, 'id')"),
    ("proc sql; select * from mydata where age between 20 and 30; quit;", "mydata.filter(mydata['age'].between(20, 30)).show()"),
    ("proc sql; select id, name, sum(sales) as total_sales from mydata group by id, name; quit;", "mydata.groupBy('id', 'name').agg({'sales': 'sum'}).withColumnRenamed('sum(sales)', 'total_sales').show()"),
    ("proc sql; select * from mydata order by age desc; quit;", "mydata.orderBy(mydata['age'].desc()).show()"),
    ("data mydata; set olddata; if age=. then age=0; run;", "mydata = olddata.fillna(0, subset=['age'])"),
    ("data mydata; set olddata; length newvar $10; newvar='New Value'; run;", "mydata = olddata.withColumn('newvar', lit('New Value'))"),
    ("proc datasets library=work; delete olddata; run; quit;", "olddata.unpersist()"),
    ("data mydata; set olddata; by group; if first.group then count=1; else count+1; run;", "from pyspark.sql.window import Window; from pyspark.sql.functions import row_number; w = Window.partitionBy('group').orderBy('some_column'); mydata = olddata.withColumn('count', row_number().over(w))"),
    ("proc rank data=mydata out=ranked_data groups=10; var age; ranks rank; run;", "from pyspark.ml.feature import QuantileDiscretizer; qd = QuantileDiscretizer(numBuckets=10, inputCol='age', outputCol='rank'); ranked_data = qd.fit(mydata).transform(mydata)"),
    ("proc freq data=mydata; tables gender*age / chisq; run;", "from pyspark.ml.stat import ChiSquareTest; r = ChiSquareTest.test(mydata, 'age', 'gender').head(); print('pValues: ' + str(r.pValues))"),
    ("proc tabulate data=mydata; class gender; var age; table gender, age*mean; run;", "mydata.groupBy('gender').pivot('age').agg({'age': 'mean'}).show()"),
    ("proc gchart data=mydata; vbar age / type=freq; run;", "mydata.groupBy('age').count().toPandas().plot(kind='bar', x='age', y='count')"),
    ("proc gchart data=mydata; pie gender; run;", "mydata.groupBy('gender').count().toPandas().plot(kind='pie', y='count', labels=mydata.groupBy('gender').count().toPandas()['gender'])"),
    ("proc corr data=mydata; var age height weight; with gender; run;", "mydata.groupBy('gender').agg(corr('age', 'height').alias('age_height_corr'), corr('age', 'weight').alias('age_weight_corr'), corr('height', 'weight').alias('height_weight_corr')).show()"),
    ("proc sql; create index idx_id on mydata(id); quit;", "mydata.createOrReplaceTempView('mydata'); spark.sql('CREATE INDEX idx_id ON mydata(id)')"),
    ("proc sql; drop index idx_id on mydata; quit;", "mydata.createOrReplaceTempView('mydata'); spark.sql('DROP INDEX idx_id ON mydata')"),
    ("data mydata; set olddata; where age>30 and gender='Male'; run;", "mydata = olddata.filter((olddata['age'] > 30) & (olddata['gender'] == 'Male'))"),
    ("data mydata; set olddata; where age in (20, 30, 40); run;", "mydata = olddata.filter(olddata['age'].isin([20, 30, 40]))"),
    ("data mydata; set olddata; where name like 'A%'; run;", "mydata = olddata.filter(olddata['name'].like('A%'))"),
    ("data mydata; set olddata; where name contains 'John'; run;", "mydata = olddata.filter(olddata['name'].contains('John'))"),
    ("data mydata; set olddata; keep id name calculated_age; calculated_age=age+10; run;", "mydata = olddata.select('id', 'name').withColumn('calculated_age', olddata['age'] + 10)"),
    ("data mydata; set olddata; if age>30 then age_group='Old'; else age_group='Young'; run;", "mydata = olddata.withColumn('age_group', when(olddata['age'] > 30, 'Old').otherwise('Young'))"),
    ("data mydata; set olddata; length city $20; city=substr(address, 1, 20); run;", "mydata = olddata.withColumn('city', substring(olddata['address'], 1, 20))"),
    ("data mydata; set olddata; drop address; run;", "mydata = olddata.drop('address')"),
    ("proc sql; select id, name from mydata union select id, name from otherdata; quit;", "mydata.select('id', 'name').union(otherdata.select('id', 'name')).show()"),
    ("proc sql; select id, name from mydata intersect select id, name from otherdata; quit;", "mydata.select('id', 'name').intersect(otherdata.select('id', 'name')).show()"),
    ("proc sql; select id, name from mydata except select id, name from otherdata; quit;", "mydata.select('id', 'name').subtract(otherdata.select('id', 'name')).show()"),
    ("data mydata; set olddata; if _n_=1 then output; run;", "mydata = olddata.limit(1)"),
    ("data mydata; set olddata; if mod(_n_, 2)=0 then output; run;", "mydata = olddata.filter(olddata.rdd.zipWithIndex().map(lambda x: (x[0], x[1] % 2 == 0)).map(lambda x: x[0] if x[1] else None).filter(lambda x: x is not None).toDF(olddata.schema))")

]

dataset = SimpleDataset(qa_data, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [19]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Set model to training mode
model.train()

# Training loop (mock training)
for batch in dataloader:
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]

    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    print(f"Loss: {loss.item()}")  # Print loss for monitoring

# Save the trained model
model.save_pretrained("trained_gpt2_model")
tokenizer.save_pretrained("trained_gpt2_model")  # Save the tokenizer as well

Loss: 6.43989896774292
Loss: 4.31673002243042
Loss: 3.283054828643799
Loss: 2.2036919593811035
Loss: 2.620326280593872
Loss: 2.222964286804199
Loss: 3.5895791053771973
Loss: 2.225044012069702
Loss: 1.8282418251037598
Loss: 1.1461803913116455
Loss: 1.4046096801757812
Loss: 1.296327829360962
Loss: 1.251974105834961
Loss: 1.6013426780700684
Loss: 1.0823968648910522
Loss: 1.8188296556472778
Loss: 1.0448658466339111
Loss: 2.8519859313964844
Loss: 1.5676568746566772
Loss: 1.0785216093063354
Loss: 0.8448755145072937
Loss: 1.483573079109192
Loss: 2.250091552734375
Loss: 2.88724946975708
Loss: 1.2571179866790771
Loss: 1.6168956756591797
Loss: 2.9375269412994385


('trained_gpt2_model/tokenizer_config.json',
 'trained_gpt2_model/special_tokens_map.json',
 'trained_gpt2_model/vocab.json',
 'trained_gpt2_model/merges.txt',
 'trained_gpt2_model/added_tokens.json')

In [20]:
# Load the trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("trained_gpt2_model")
tokenizer = GPT2Tokenizer.from_pretrained("trained_gpt2_model")
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [24]:
def ask_question(question):
    input_text = f"Question: {question} Answer:"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=50,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )

    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer.split("Answer:")[-1].strip()  # Return the generated answer

In [25]:
print(ask_question("proc means data=mydata; run;"))
# print(ask_question("How does AstroSynth use artificial intelligence?"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No.


In [23]:
# prompt: how to give prompt

# Assuming the code you provided is in a file named 'main.py' and you have already executed it in your Colab environment.

# Get user input
user_question = input("Enter your SAS code: ")

# Call the ask_question function with the user's input
generated_answer = ask_question(user_question)

# Print the generated answer
print(f"Generated Spark code: {generated_answer}")

Enter your SAS code: proc means data=mydata; run;


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Spark code: No;
