In [1]:
!pip install transformers



In [2]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import os
df = pd.read_csv("servicenow_cleaned.csv")

In [3]:
rows_per_group = 100
df_sample = (
    df.groupby('assignment_group',group_keys=False)
    .apply(lambda x: x.sample(min(len(x),rows_per_group),random_state=42))
    .reset_index(drop=True)
)
print("sample created with shape:",df_sample.shape)
df_sample['assignment_group'].value_counts().head()

sample created with shape: (2312, 23)


  .apply(lambda x: x.sample(min(len(x),rows_per_group),random_state=42))


assignment_group
HCL-AVD-Citrix Support    100
HCL-GSD-English           100
HCL-RHYTHM                100
HCL-SCCM                  100
HCL-Service Now           100
Name: count, dtype: int64

In [4]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

Device set to use cpu


In [5]:
text_data = df_sample[['incident_number','summary','description','resolution_notes']]

In [7]:
os.makedirs("checkpoints",exist_ok=True)
checkpoints_path = "checkpoints/servicenow_ai_summary.csv"
if 'ai_summary' not in df_sample.columns:
    df_sample['ai_summary'] = None
# Summarization function
def generate_summary(row):
    text = (str(row['summary']) + " " + str(row['description'])+ " " + str(row['resolution_notes'])).strip()
    if len(text) < 50:
        return text  # skip short text
    try:
        result = summarizer(text, max_length=50, min_length=20, do_sample=False)
        return result[0]['summary_text']
    except Exception as e:
        return f"Error: {e}"

# Progress with auto-save
tqdm.pandas()
for i in tqdm(range(len(df_sample))):
    if pd.isnull(df_sample.loc[i, 'ai_summary']):
        df_sample.loc[i, 'ai_summary'] = generate_summary(df_sample.loc[i])
    
    # Save checkpoint every 200 rows
    if i % 200 == 0 and i != 0:
        df_sample.to_csv(checkpoints_path, index=False)
        print(f"✅ Checkpoint saved at row {i}")

# Save final output
df_sample.to_excel("servicenow_ai_summary_final.xlsx", index=False)
print("🎉 Summarization completed and file saved!")

  9%|██████▊                                                                       | 201/2312 [00:00<00:04, 463.39it/s]

✅ Checkpoint saved at row 200


  9%|███████                                                                        | 205/2312 [00:59<19:29,  1.80it/s]Your max_length is set to 50, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
 10%|███████▋                                                                     | 232/2312 [06:39<6:55:11, 11.98s/it]Your max_length is set to 50, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
 11%|████████▌                                                                    | 256/2312 [12:11<9:58:21, 17.46s/it]Your max_length is set to 50, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typical

✅ Checkpoint saved at row 400


 18%|█████████████▊                                                               | 415/2312 [53:20<8:16:00, 15.69s/it]Your max_length is set to 50, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
 18%|██████████████                                                               | 422/2312 [54:19<4:08:31,  7.89s/it]Your max_length is set to 50, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
 18%|██████████████▏                                                              | 426/2312 [55:03<5:53:27, 11.24s/it]Your max_length is set to 50, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typica

✅ Checkpoint saved at row 600


 26%|███████████████████▌                                                       | 603/2312 [1:27:24<4:41:16,  9.88s/it]Your max_length is set to 50, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
 26%|███████████████████▌                                                       | 604/2312 [1:27:31<4:20:25,  9.15s/it]Your max_length is set to 50, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
 26%|███████████████████▋                                                       | 605/2312 [1:27:38<4:02:57,  8.54s/it]Your max_length is set to 50, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typica

✅ Checkpoint saved at row 800


 35%|██████████████████████████▍                                                | 814/2312 [2:08:13<5:45:57, 13.86s/it]Your max_length is set to 50, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
 38%|████████████████████████████▊                                              | 887/2312 [2:22:54<5:05:19, 12.86s/it]Your max_length is set to 50, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
 38%|████████████████████████████▊                                              | 890/2312 [2:23:20<4:01:50, 10.20s/it]Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typica

✅ Checkpoint saved at row 1000


 44%|████████████████████████████████▎                                         | 1009/2312 [2:43:45<3:29:01,  9.62s/it]Your max_length is set to 50, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
 44%|████████████████████████████████▎                                         | 1011/2312 [2:44:04<3:32:10,  9.79s/it]Your max_length is set to 50, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
 44%|████████████████████████████████▌                                         | 1016/2312 [2:44:50<3:22:17,  9.37s/it]Your max_length is set to 50, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typica

✅ Checkpoint saved at row 1200


 54%|███████████████████████████████████████▋                                  | 1239/2312 [3:20:42<3:23:47, 11.40s/it]Your max_length is set to 50, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
 54%|███████████████████████████████████████▊                                  | 1245/2312 [3:21:32<2:29:09,  8.39s/it]Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
 54%|████████████████████████████████████████▏                                 | 1255/2312 [3:23:12<3:00:11, 10.23s/it]Your max_length is set to 50, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typica

✅ Checkpoint saved at row 1400


 64%|███████████████████████████████████████████████▋                          | 1488/2312 [4:07:52<3:21:44, 14.69s/it]Your max_length is set to 50, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
 69%|███████████████████████████████████████████████████▏                      | 1601/2312 [4:29:25<2:14:24, 11.34s/it]

✅ Checkpoint saved at row 1600


 72%|█████████████████████████████████████████████████████▌                    | 1675/2312 [4:41:42<1:46:28, 10.03s/it]Your max_length is set to 50, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
 72%|█████████████████████████████████████████████████████▋                    | 1676/2312 [4:41:49<1:35:07,  8.97s/it]Your max_length is set to 50, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
 73%|█████████████████████████████████████████████████████▊                    | 1682/2312 [4:42:55<2:05:44, 11.97s/it]Your max_length is set to 50, but your input_length is only 40. Since this is a summarization task, where outputs shorter than the input are typica

✅ Checkpoint saved at row 1800


 78%|█████████████████████████████████████████████████████████▊                | 1805/2312 [5:02:23<1:14:36,  8.83s/it]Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
 78%|█████████████████████████████████████████████████████████▊                | 1806/2312 [5:02:31<1:12:07,  8.55s/it]Your max_length is set to 50, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
 78%|█████████████████████████████████████████████████████████▊                | 1807/2312 [5:02:39<1:09:09,  8.22s/it]Your max_length is set to 50, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typica

✅ Checkpoint saved at row 2000


 88%|██████████████████████████████████████████████████████████████████▋         | 2027/2312 [5:39:20<48:01, 10.11s/it]Your max_length is set to 50, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
 90%|████████████████████████████████████████████████████████████████████▍       | 2082/2312 [5:50:11<44:32, 11.62s/it]Your max_length is set to 50, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
 91%|█████████████████████████████████████████████████████████████████████▍      | 2111/2312 [5:55:35<39:23, 11.76s/it]Your max_length is set to 50, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typica

✅ Checkpoint saved at row 2200


 97%|█████████████████████████████████████████████████████████████████████████▌  | 2239/2312 [6:15:01<10:57,  9.00s/it]Your max_length is set to 50, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
 97%|█████████████████████████████████████████████████████████████████████████▋  | 2240/2312 [6:15:08<10:10,  8.48s/it]Your max_length is set to 50, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
 97%|█████████████████████████████████████████████████████████████████████████▊  | 2244/2312 [6:15:39<09:41,  8.56s/it]Your max_length is set to 50, but your input_length is only 27. Since this is a summarization task, where outputs shorter than the input are typica

🎉 Summarization completed and file saved!
